blob: 9de1e53c1dc80551754cfb2cc45dadd210d776c2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
56#define MAX_UNICODE_FREELIST_SIZE 1024
57
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Barry Warsaw51ac5802000-03-20 16:36:48 +000064 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
120 PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000122
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000124PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000126#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127 return 0x10FFFF;
128#else
129 /* This is actually an illegal character, so it should
130 not be passed to unichr. */
131 return 0xFFFF;
132#endif
133}
134
Thomas Wouters477c8d52006-05-27 19:21:47 +0000135/* --- Bloom Filters ----------------------------------------------------- */
136
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
142
/* Type of the single-word bit mask that serves as the bloom filter. */
#define BLOOM_MASK unsigned long

/* Filter holding the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the 5 least significant bits of ch is
   set in mask.  A zero result means ch is definitely not in the set; a
   non-zero result means it may be.

   The shift is performed on an unsigned long so that bit 31 can be
   selected without overflowing a signed int (and without the result being
   sign-extended into the upper half of a wider mask).  mask is
   parenthesized so that compound expressions expand as intended. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-check with the bloom filter, then the exact test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154 /* calculate simple bloom-style bitmask for a given unicode string */
155
156 long mask;
157 Py_ssize_t i;
158
159 mask = 0;
160 for (i = 0; i < len; i++)
161 mask |= (1 << (ptr[i] & 0x1F));
162
163 return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168 Py_ssize_t i;
169
170 for (i = 0; i < setlen; i++)
171 if (set[i] == chr)
172 return 1;
173
174 return 0;
175}
176
/* Exact membership test for chr in set, short-circuited by the bloom
   filter mask.  The replacement list and every argument are
   parenthesized so that the macro behaves like a single expression,
   e.g. !BLOOM_MEMBER(...) negates the complete test. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
179
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180/* --- Unicode Object ----------------------------------------------------- */
181
182static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000184 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185{
186 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000187
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000191
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192 /* Resizing shared object (unicode_empty or single character
193 objects) in-place is not allowed. Use PyUnicode_Resize()
194 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000195
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205 /* We allocate one more byte to make sure the string is Ux0000 terminated.
206 The overallocation is also used by fastsearch, which assumes that it's
207 safe to look at str[length] (without making any assumptions about what
208 it contains). */
209
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 oldstr = unicode->str;
211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 PyErr_NoMemory();
215 return -1;
216 }
217 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000222 if (unicode->defenc) {
223 Py_DECREF(unicode->defenc);
224 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000227
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 return 0;
229}
230
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
239
240static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
243 register PyUnicodeObject *unicode;
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 if (length == 0 && unicode_empty != NULL) {
247 Py_INCREF(unicode_empty);
248 return unicode_empty;
249 }
250
251 /* Unicode freelist & memory allocation */
252 if (unicode_freelist) {
253 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000254 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 /* Keep-Alive optimization: we only upsize the buffer,
258 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000259 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000260 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000267 }
268 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 }
270 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 if (unicode == NULL)
273 return NULL;
274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000292 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000295
296 onError:
297 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000298 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300}
301
302static
Guido van Rossum9475a232001-10-05 20:51:39 +0000303void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
Walter Dörwald16807132007-05-25 13:52:07 +0000305 switch (PyUnicode_CHECK_INTERNED(unicode)) {
306 case SSTATE_NOT_INTERNED:
307 break;
308
309 case SSTATE_INTERNED_MORTAL:
310 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000311 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313 Py_FatalError(
314 "deletion of interned unicode string failed");
315 break;
316
317 case SSTATE_INTERNED_IMMORTAL:
318 Py_FatalError("Immortal interned unicode string died.");
319
320 default:
321 Py_FatalError("Inconsistent interned unicode string state.");
322 }
323
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 if (PyUnicode_CheckExact(unicode) &&
325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000326 /* Keep-Alive optimization */
327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000328 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 unicode->str = NULL;
330 unicode->length = 0;
331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000332 if (unicode->defenc) {
333 Py_DECREF(unicode->defenc);
334 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000335 }
336 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 *(PyUnicodeObject **)unicode = unicode_freelist;
338 unicode_freelist = unicode;
339 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 }
341 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000342 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000343 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000344 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346}
347
Martin v. Löwis18e16552006-02-15 17:27:45 +0000348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349{
350 register PyUnicodeObject *v;
351
352 /* Argument checks */
353 if (unicode == NULL) {
354 PyErr_BadInternalCall();
355 return -1;
356 }
357 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 PyErr_BadInternalCall();
360 return -1;
361 }
362
363 /* Resizing unicode_empty and single character objects is not
364 possible since these are being shared. We simply return a fresh
365 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000366 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 (v == unicode_empty || v->length == 1)) {
368 PyUnicodeObject *w = _PyUnicode_New(length);
369 if (w == NULL)
370 return -1;
371 Py_UNICODE_COPY(w->str, v->str,
372 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000373 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000374 *unicode = (PyObject *)w;
375 return 0;
376 }
377
378 /* Note that we don't have to modify *unicode for unshared Unicode
379 objects, since we can modify them in-place. */
380 return unicode_resize(v, length);
381}
382
383/* Internal API for use in unicodeobject.c only ! */
384#define _PyUnicode_Resize(unicodevar, length) \
385 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
386
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 PyUnicodeObject *unicode;
391
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392 /* If the Unicode data is known at construction time, we can apply
393 some optimizations which share commonly used objects. */
394 if (u != NULL) {
395
396 /* Optimization for empty strings */
397 if (size == 0 && unicode_empty != NULL) {
398 Py_INCREF(unicode_empty);
399 return (PyObject *)unicode_empty;
400 }
401
402 /* Single character Unicode objects in the Latin-1 range are
403 shared when using this constructor */
404 if (size == 1 && *u < 256) {
405 unicode = unicode_latin1[*u];
406 if (!unicode) {
407 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 if (!unicode)
409 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000410 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 unicode_latin1[*u] = unicode;
412 }
413 Py_INCREF(unicode);
414 return (PyObject *)unicode;
415 }
416 }
Tim Petersced69f82003-09-16 20:30:58 +0000417
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 unicode = _PyUnicode_New(size);
419 if (!unicode)
420 return NULL;
421
422 /* Copy the Unicode data into the new object */
423 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425
426 return (PyObject *)unicode;
427}
428
Walter Dörwaldd2034312007-05-18 16:29:38 +0000429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430{
431 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000433 some optimizations which share commonly used objects.
434 Also, this means the input must be UTF-8, so fall back to the
435 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 if (u != NULL) {
437
438 /* Optimization for empty strings */
439 if (size == 0 && unicode_empty != NULL) {
440 Py_INCREF(unicode_empty);
441 return (PyObject *)unicode_empty;
442 }
443
Martin v. Löwis9c121062007-08-05 20:26:11 +0000444 /* Single characters are shared when using this constructor.
445 Restrict to ASCII, since the input must be UTF-8. */
446 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000447 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 if (!unicode) {
449 unicode = _PyUnicode_New(1);
450 if (!unicode)
451 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode->str[0] = Py_CHARMASK(*u);
453 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 }
455 Py_INCREF(unicode);
456 return (PyObject *)unicode;
457 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000458
459 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 }
461
Walter Dörwald55507312007-05-18 13:12:10 +0000462 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 if (!unicode)
464 return NULL;
465
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000466 return (PyObject *)unicode;
467}
468
Walter Dörwaldd2034312007-05-18 16:29:38 +0000469PyObject *PyUnicode_FromString(const char *u)
470{
471 size_t size = strlen(u);
472 if (size > PY_SSIZE_T_MAX) {
473 PyErr_SetString(PyExc_OverflowError, "input too long");
474 return NULL;
475 }
476
477 return PyUnicode_FromStringAndSize(u, size);
478}
479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
485 PyUnicodeObject *unicode;
486
487 if (w == NULL) {
488 PyErr_BadInternalCall();
489 return NULL;
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000499#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 {
501 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000504 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 *u++ = *w++;
506 }
507#endif
508
509 return (PyObject *)unicode;
510}
511
Walter Dörwald346737f2007-05-31 10:44:43 +0000512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515 *fmt++ = '%';
516 if (width) {
517 if (zeropad)
518 *fmt++ = '0';
519 fmt += sprintf(fmt, "%d", width);
520 }
521 if (precision)
522 fmt += sprintf(fmt, ".%d", precision);
523 if (longflag)
524 *fmt++ = 'l';
525 else if (size_tflag) {
526 char *f = PY_FORMAT_SIZE_T;
527 while (*f)
528 *fmt++ = *f++;
529 }
530 *fmt++ = c;
531 *fmt = '\0';
532}
533
Walter Dörwaldd2034312007-05-18 16:29:38 +0000534#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 Py_ssize_t callcount = 0;
541 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000542 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000544 int width = 0;
545 int precision = 0;
546 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 const char* f;
548 Py_UNICODE *s;
549 PyObject *string;
550 /* used by sprintf */
551 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000552 /* use abuffer instead of buffer, if we need more space
553 * (which can happen if there's a format specifier with width). */
554 char *abuffer = NULL;
555 char *realbuffer;
556 Py_ssize_t abuffersize = 0;
557 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000558 const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561 Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef __va_copy
564 __va_copy(count, vargs);
565#else
566 count = vargs;
567#endif
568#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000569 /* step 1: count the number of %S/%R format specifications
570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000574 ++callcount;
575 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 /* step 2: allocate memory for the results of
577 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 if (callcount) {
579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580 if (!callresults) {
581 PyErr_NoMemory();
582 return NULL;
583 }
584 callresult = callresults;
585 }
586 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000587 for (f = format; *f; f++) {
588 if (*f == '%') {
589 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000590 width = 0;
591 while (isdigit(Py_CHARMASK(*f)))
592 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594 ;
595
596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597 * they don't affect the amount of space we reserve.
598 */
599 if ((*f == 'l' || *f == 'z') &&
600 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000601 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000602
603 switch (*f) {
604 case 'c':
605 (void)va_arg(count, int);
606 /* fall through... */
607 case '%':
608 n++;
609 break;
610 case 'd': case 'u': case 'i': case 'x':
611 (void) va_arg(count, int);
612 /* 20 bytes is enough to hold a 64-bit
613 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000614 This isn't enough for octal.
615 If a width is specified we need more
616 (which we allocate later). */
617 if (width < 20)
618 width = 20;
619 n += width;
620 if (abuffersize < width)
621 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 break;
623 case 's':
624 n += strlen(va_arg(count, char*));
625 break;
626 case 'U':
627 {
628 PyObject *obj = va_arg(count, PyObject *);
629 assert(obj && PyUnicode_Check(obj));
630 n += PyUnicode_GET_SIZE(obj);
631 break;
632 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000633 case 'V':
634 {
635 PyObject *obj = va_arg(count, PyObject *);
636 const char *str = va_arg(count, const char *);
637 assert(obj || str);
638 assert(!obj || PyUnicode_Check(obj));
639 if (obj)
640 n += PyUnicode_GET_SIZE(obj);
641 else
642 n += strlen(str);
643 break;
644 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000645 case 'S':
646 {
647 PyObject *obj = va_arg(count, PyObject *);
648 PyObject *str;
649 assert(obj);
650 str = PyObject_Unicode(obj);
651 if (!str)
652 goto fail;
653 n += PyUnicode_GET_SIZE(str);
654 /* Remember the str and switch to the next slot */
655 *callresult++ = str;
656 break;
657 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000658 case 'R':
659 {
660 PyObject *obj = va_arg(count, PyObject *);
661 PyObject *repr;
662 assert(obj);
663 repr = PyObject_Repr(obj);
664 if (!repr)
665 goto fail;
666 n += PyUnicode_GET_SIZE(repr);
667 /* Remember the repr and switch to the next slot */
668 *callresult++ = repr;
669 break;
670 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000671 case 'p':
672 (void) va_arg(count, int);
673 /* maximum 64-bit pointer representation:
674 * 0xffffffffffffffff
675 * so 19 characters is enough.
676 * XXX I count 18 -- what's the extra for?
677 */
678 n += 19;
679 break;
680 default:
681 /* if we stumble upon an unknown
682 formatting code, copy the rest of
683 the format string to the output
684 string. (we cannot just skip the
685 code, since there's no way to know
686 what's in the argument list) */
687 n += strlen(p);
688 goto expand;
689 }
690 } else
691 n++;
692 }
693 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000694 if (abuffersize > 20) {
695 abuffer = PyMem_Malloc(abuffersize);
696 if (!abuffer) {
697 PyErr_NoMemory();
698 goto fail;
699 }
700 realbuffer = abuffer;
701 }
702 else
703 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000706 we don't have to resize the string.
707 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708 string = PyUnicode_FromUnicode(NULL, n);
709 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000710 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000713 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714
715 for (f = format; *f; f++) {
716 if (*f == '%') {
717 const char* p = f++;
718 int longflag = 0;
719 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000720 zeropad = (*f == '0');
721 /* parse the width.precision part */
722 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000724 width = (width*10) + *f++ - '0';
725 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 if (*f == '.') {
727 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000729 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000730 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 /* handle the long flag, but only for %ld and %lu.
732 others can be added when necessary. */
733 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
734 longflag = 1;
735 ++f;
736 }
737 /* handle the size_t flag. */
738 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
739 size_tflag = 1;
740 ++f;
741 }
742
743 switch (*f) {
744 case 'c':
745 *s++ = va_arg(vargs, int);
746 break;
747 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000754 sprintf(realbuffer, fmt, va_arg(vargs, int));
755 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 break;
757 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000764 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
765 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 break;
767 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000768 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
769 sprintf(realbuffer, fmt, va_arg(vargs, int));
770 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000771 break;
772 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000773 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
774 sprintf(realbuffer, fmt, va_arg(vargs, int));
775 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776 break;
777 case 's':
778 p = va_arg(vargs, char*);
779 appendstring(p);
780 break;
781 case 'U':
782 {
783 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000784 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
785 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
786 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 break;
788 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000789 case 'V':
790 {
791 PyObject *obj = va_arg(vargs, PyObject *);
792 const char *str = va_arg(vargs, const char *);
793 if (obj) {
794 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
795 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
796 s += size;
797 } else {
798 appendstring(str);
799 }
800 break;
801 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000802 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000803 case 'R':
804 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000805 Py_UNICODE *ucopy;
806 Py_ssize_t usize;
807 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000808 /* unused, since we already have the result */
809 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000810 ucopy = PyUnicode_AS_UNICODE(*callresult);
811 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 for (upos = 0; upos<usize;)
813 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000816 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000817 ++callresult;
818 break;
819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 case 'p':
821 sprintf(buffer, "%p", va_arg(vargs, void*));
822 /* %p is ill-defined: ensure leading 0x. */
823 if (buffer[1] == 'X')
824 buffer[1] = 'x';
825 else if (buffer[1] != 'x') {
826 memmove(buffer+2, buffer, strlen(buffer)+1);
827 buffer[0] = '0';
828 buffer[1] = 'x';
829 }
830 appendstring(buffer);
831 break;
832 case '%':
833 *s++ = '%';
834 break;
835 default:
836 appendstring(p);
837 goto end;
838 }
839 } else
840 *s++ = *f;
841 }
842
843 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 if (callresults)
845 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000846 if (abuffer)
847 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
849 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000850 fail:
851 if (callresults) {
852 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000853 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000854 Py_DECREF(*callresult2);
855 ++callresult2;
856 }
857 PyMem_Free(callresults);
858 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 if (abuffer)
860 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862}
863
864#undef appendstring
865
866PyObject *
867PyUnicode_FromFormat(const char *format, ...)
868{
869 PyObject* ret;
870 va_list vargs;
871
872#ifdef HAVE_STDARG_PROTOTYPES
873 va_start(vargs, format);
874#else
875 va_start(vargs);
876#endif
877 ret = PyUnicode_FromFormatV(format, vargs);
878 va_end(vargs);
879 return ret;
880}
881
Martin v. Löwis18e16552006-02-15 17:27:45 +0000882Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
883 wchar_t *w,
884 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
886 if (unicode == NULL) {
887 PyErr_BadInternalCall();
888 return -1;
889 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890
891 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000893 size = PyUnicode_GET_SIZE(unicode) + 1;
894
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895#ifdef HAVE_USABLE_WCHAR_T
896 memcpy(w, unicode->str, size * sizeof(wchar_t));
897#else
898 {
899 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000900 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000902 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 *w++ = *u++;
904 }
905#endif
906
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000907 if (size > PyUnicode_GET_SIZE(unicode))
908 return PyUnicode_GET_SIZE(unicode);
909 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910 return size;
911}
912
913#endif
914
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915PyObject *PyUnicode_FromOrdinal(int ordinal)
916{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000917 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000918
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 if (ordinal < 0 || ordinal > 0x10ffff) {
920 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000922 return NULL;
923 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000924
925#ifndef Py_UNICODE_WIDE
926 if (ordinal > 0xffff) {
927 ordinal -= 0x10000;
928 s[0] = 0xD800 | (ordinal >> 10);
929 s[1] = 0xDC00 | (ordinal & 0x3FF);
930 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000931 }
932#endif
933
Hye-Shik Chang40574832004-04-06 07:24:51 +0000934 s[0] = (Py_UNICODE)ordinal;
935 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000936}
937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938PyObject *PyUnicode_FromObject(register PyObject *obj)
939{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000940 /* XXX Perhaps we should make this API an alias of
941 PyObject_Unicode() instead ?! */
942 if (PyUnicode_CheckExact(obj)) {
943 Py_INCREF(obj);
944 return obj;
945 }
946 if (PyUnicode_Check(obj)) {
947 /* For a Unicode subtype that's not a Unicode object,
948 return a true Unicode object with the same data. */
949 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
950 PyUnicode_GET_SIZE(obj));
951 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000952 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
953}
954
955PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
956 const char *encoding,
957 const char *errors)
958{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000959 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000961 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 if (obj == NULL) {
964 PyErr_BadInternalCall();
965 return NULL;
966 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000967
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000968 if (PyUnicode_Check(obj)) {
969 PyErr_SetString(PyExc_TypeError,
970 "decoding Unicode is not supported");
971 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000972 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000973
974 /* Coerce object */
975 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 s = PyString_AS_STRING(obj);
977 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
980 /* Overwrite the error message with something more useful in
981 case of a TypeError. */
982 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 "coercing to Unicode: need string or buffer, "
985 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000986 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000987 goto onError;
988 }
Tim Petersced69f82003-09-16 20:30:58 +0000989
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000990 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 if (len == 0) {
992 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000993 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 }
Tim Petersced69f82003-09-16 20:30:58 +0000995 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000996 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000997
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000998 return v;
999
1000 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002}
1003
1004PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001005 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001006 const char *encoding,
1007 const char *errors)
1008{
1009 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001010
1011 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001012 encoding = PyUnicode_GetDefaultEncoding();
1013
1014 /* Shortcuts for common default encodings */
1015 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001017 else if (strcmp(encoding, "latin-1") == 0)
1018 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001019#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1020 else if (strcmp(encoding, "mbcs") == 0)
1021 return PyUnicode_DecodeMBCS(s, size, errors);
1022#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001023 else if (strcmp(encoding, "ascii") == 0)
1024 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025
1026 /* Decode via the codec registry */
1027 buffer = PyBuffer_FromMemory((void *)s, size);
1028 if (buffer == NULL)
1029 goto onError;
1030 unicode = PyCodec_Decode(buffer, encoding, errors);
1031 if (unicode == NULL)
1032 goto onError;
1033 if (!PyUnicode_Check(unicode)) {
1034 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001035 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001036 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 Py_DECREF(unicode);
1038 goto onError;
1039 }
1040 Py_DECREF(buffer);
1041 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001042
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 onError:
1044 Py_XDECREF(buffer);
1045 return NULL;
1046}
1047
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001048PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1049 const char *encoding,
1050 const char *errors)
1051{
1052 PyObject *v;
1053
1054 if (!PyUnicode_Check(unicode)) {
1055 PyErr_BadArgument();
1056 goto onError;
1057 }
1058
1059 if (encoding == NULL)
1060 encoding = PyUnicode_GetDefaultEncoding();
1061
1062 /* Decode via the codec registry */
1063 v = PyCodec_Decode(unicode, encoding, errors);
1064 if (v == NULL)
1065 goto onError;
1066 return v;
1067
1068 onError:
1069 return NULL;
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001073 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 const char *encoding,
1075 const char *errors)
1076{
1077 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 unicode = PyUnicode_FromUnicode(s, size);
1080 if (unicode == NULL)
1081 return NULL;
1082 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1083 Py_DECREF(unicode);
1084 return v;
1085}
1086
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001087PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1088 const char *encoding,
1089 const char *errors)
1090{
1091 PyObject *v;
1092
1093 if (!PyUnicode_Check(unicode)) {
1094 PyErr_BadArgument();
1095 goto onError;
1096 }
1097
1098 if (encoding == NULL)
1099 encoding = PyUnicode_GetDefaultEncoding();
1100
1101 /* Encode via the codec registry */
1102 v = PyCodec_Encode(unicode, encoding, errors);
1103 if (v == NULL)
1104 goto onError;
1105 return v;
1106
1107 onError:
1108 return NULL;
1109}
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1112 const char *encoding,
1113 const char *errors)
1114{
1115 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001116
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 if (!PyUnicode_Check(unicode)) {
1118 PyErr_BadArgument();
1119 goto onError;
1120 }
Fred Drakee4315f52000-05-09 19:53:39 +00001121
Tim Petersced69f82003-09-16 20:30:58 +00001122 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001123 encoding = PyUnicode_GetDefaultEncoding();
1124
1125 /* Shortcuts for common default encodings */
1126 if (errors == NULL) {
1127 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001128 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001129 else if (strcmp(encoding, "latin-1") == 0)
1130 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001131#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1132 else if (strcmp(encoding, "mbcs") == 0)
1133 return PyUnicode_AsMBCSString(unicode);
1134#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001135 else if (strcmp(encoding, "ascii") == 0)
1136 return PyUnicode_AsASCIIString(unicode);
1137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139 /* Encode via the codec registry */
1140 v = PyCodec_Encode(unicode, encoding, errors);
1141 if (v == NULL)
1142 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001143 if (!PyBytes_Check(v)) {
1144 if (PyString_Check(v)) {
1145 /* Old codec, turn it into bytes */
1146 PyObject *b = PyBytes_FromObject(v);
1147 Py_DECREF(v);
1148 return b;
1149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001151 "encoder did not return a bytes object "
1152 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1153 v->ob_type->tp_name,
1154 encoding ? encoding : "NULL",
1155 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 Py_DECREF(v);
1157 goto onError;
1158 }
1159 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 onError:
1162 return NULL;
1163}
1164
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001165PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1166 const char *errors)
1167{
1168 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001169 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001170 if (v)
1171 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001172 if (errors != NULL)
1173 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001174 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1175 PyUnicode_GET_SIZE(unicode),
1176 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001177 if (!b)
1178 return NULL;
1179 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1180 PyBytes_Size(b));
1181 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001182 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001183 return v;
1184}
1185
Martin v. Löwis5b222132007-06-10 09:51:05 +00001186char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001187PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001188{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001189 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001190 if (!PyUnicode_Check(unicode)) {
1191 PyErr_BadArgument();
1192 return NULL;
1193 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001194 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1195 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001196 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001197 if (psize != NULL)
1198 *psize = PyString_GET_SIZE(str8);
1199 return PyString_AS_STRING(str8);
1200}
1201
1202char*
1203PyUnicode_AsString(PyObject *unicode)
1204{
1205 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001206}
1207
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1209{
1210 if (!PyUnicode_Check(unicode)) {
1211 PyErr_BadArgument();
1212 goto onError;
1213 }
1214 return PyUnicode_AS_UNICODE(unicode);
1215
1216 onError:
1217 return NULL;
1218}
1219
Martin v. Löwis18e16552006-02-15 17:27:45 +00001220Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221{
1222 if (!PyUnicode_Check(unicode)) {
1223 PyErr_BadArgument();
1224 goto onError;
1225 }
1226 return PyUnicode_GET_SIZE(unicode);
1227
1228 onError:
1229 return -1;
1230}
1231
Thomas Wouters78890102000-07-22 19:25:51 +00001232const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001233{
1234 return unicode_default_encoding;
1235}
1236
1237int PyUnicode_SetDefaultEncoding(const char *encoding)
1238{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001239 if (strcmp(encoding, unicode_default_encoding) != 0) {
1240 PyErr_Format(PyExc_ValueError,
1241 "Can only set default encoding to %s",
1242 unicode_default_encoding);
1243 return -1;
1244 }
Fred Drakee4315f52000-05-09 19:53:39 +00001245 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001246}
1247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248/* error handling callback helper:
1249 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001250 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001251 and adjust various state variables.
1252 return 0 on success, -1 on error
1253*/
1254
1255static
1256int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1257 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001258 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001259 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001261 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262
1263 PyObject *restuple = NULL;
1264 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001265 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001266 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 Py_ssize_t requiredsize;
1268 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001270 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001271 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272 int res = -1;
1273
1274 if (*errorHandler == NULL) {
1275 *errorHandler = PyCodec_LookupError(errors);
1276 if (*errorHandler == NULL)
1277 goto onError;
1278 }
1279
1280 if (*exceptionObject == NULL) {
1281 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001282 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001283 if (*exceptionObject == NULL)
1284 goto onError;
1285 }
1286 else {
1287 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1288 goto onError;
1289 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1290 goto onError;
1291 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1292 goto onError;
1293 }
1294
1295 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1296 if (restuple == NULL)
1297 goto onError;
1298 if (!PyTuple_Check(restuple)) {
1299 PyErr_Format(PyExc_TypeError, &argparse[4]);
1300 goto onError;
1301 }
1302 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1303 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001304
1305 /* Copy back the bytes variables, which might have been modified by the
1306 callback */
1307 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1308 if (!inputobj)
1309 goto onError;
1310 if (!PyBytes_Check(inputobj)) {
1311 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1312 }
1313 *input = PyBytes_AS_STRING(inputobj);
1314 insize = PyBytes_GET_SIZE(inputobj);
1315 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001316 /* we can DECREF safely, as the exception has another reference,
1317 so the object won't go away. */
1318 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001320 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001321 newpos = insize+newpos;
1322 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001323 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001324 goto onError;
1325 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326
1327 /* need more space? (at least enough for what we
1328 have+the replacement+the rest of the string (starting
1329 at the new input position), so we won't have to check space
1330 when there are no errors in the rest of the string) */
1331 repptr = PyUnicode_AS_UNICODE(repunicode);
1332 repsize = PyUnicode_GET_SIZE(repunicode);
1333 requiredsize = *outpos + repsize + insize-newpos;
1334 if (requiredsize > outsize) {
1335 if (requiredsize<2*outsize)
1336 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001337 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338 goto onError;
1339 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1340 }
1341 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001342 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 Py_UNICODE_COPY(*outptr, repptr, repsize);
1344 *outptr += repsize;
1345 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001347 /* we made it! */
1348 res = 0;
1349
1350 onError:
1351 Py_XDECREF(restuple);
1352 return res;
1353}
1354
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001355/* --- UTF-7 Codec -------------------------------------------------------- */
1356
1357/* see RFC2152 for details */
1358
/* Classification of the 128 ASCII characters for UTF-7, indicating whether
   a character is special, i.e. cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   One row per 16 characters, starting at 0x00. */
static char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(c, encodeO, encodeWS): true when c cannot be written directly.
   encodeO and encodeWS additionally make Set O characters (class 3) and
   whitespace (class 2) count as special.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */
#define SPECIAL(c, encodeO, encodeWS)                       \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||     \
     (encodeWS && (utf7_special[(c)] == 2)) ||              \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)                                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a base64 digit. */
#define B64CHAR(c)                                          \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c; the argument is not
   validated. */
#define UB64(c)                                             \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?       \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch, emit
   the topmost six of them as a base64 digit through out and decrease bits
   accordingly. */
#define ENCODE(out, ch, bits)                               \
    while (bits >= 6) {                                     \
        *out++ = B64(ch >> (bits-6));                       \
        bits -= 6;                                          \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, extract the topmost sixteen as one Py_UNICODE and store
   it through out.  A value in 0xDC00..0xDFFF is reported as an error
   instead; this relies on a variable `errmsg` and a label `utf7Error` in
   the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while (bits >= 16) {                                                    \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);       \
        bits -= 16;                                                         \
        if (surrogate) {                                                    \
            /* We have already generated an error for the high surrogate    \
               so let's not bother seeing if the low surrogate is correct   \
               or not */                                                    \
            surrogate = 0;                                                  \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            /* This is a surrogate pair. Unfortunately we can't represent   \
               it in a 16-bit character */                                  \
            surrogate = 1;                                                  \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *out++ = outCh;                                                 \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001421PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001422 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001423 const char *errors)
1424{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001425 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001426 Py_ssize_t startinpos;
1427 Py_ssize_t endinpos;
1428 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001429 const char *e;
1430 PyUnicodeObject *unicode;
1431 Py_UNICODE *p;
1432 const char *errmsg = "";
1433 int inShift = 0;
1434 unsigned int bitsleft = 0;
1435 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001436 int surrogate = 0;
1437 PyObject *errorHandler = NULL;
1438 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001439
1440 unicode = _PyUnicode_New(size);
1441 if (!unicode)
1442 return NULL;
1443 if (size == 0)
1444 return (PyObject *)unicode;
1445
1446 p = unicode->str;
1447 e = s + size;
1448
1449 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001450 Py_UNICODE ch;
1451 restart:
1452 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001453
1454 if (inShift) {
1455 if ((ch == '-') || !B64CHAR(ch)) {
1456 inShift = 0;
1457 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001458
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001459 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1460 if (bitsleft >= 6) {
1461 /* The shift sequence has a partial character in it. If
1462 bitsleft < 6 then we could just classify it as padding
1463 but that is not the case here */
1464
1465 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001466 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467 }
1468 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001469 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001470 here so indicate the potential of a misencoded character. */
1471
1472 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1473 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1474 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001475 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476 }
1477
1478 if (ch == '-') {
1479 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001480 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 inShift = 1;
1482 }
1483 } else if (SPECIAL(ch,0,0)) {
1484 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001485 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001486 } else {
1487 *p++ = ch;
1488 }
1489 } else {
1490 charsleft = (charsleft << 6) | UB64(ch);
1491 bitsleft += 6;
1492 s++;
1493 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1494 }
1495 }
1496 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001497 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001498 s++;
1499 if (s < e && *s == '-') {
1500 s++;
1501 *p++ = '+';
1502 } else
1503 {
1504 inShift = 1;
1505 bitsleft = 0;
1506 }
1507 }
1508 else if (SPECIAL(ch,0,0)) {
1509 errmsg = "unexpected special character";
1510 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001511 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001512 }
1513 else {
1514 *p++ = ch;
1515 s++;
1516 }
1517 continue;
1518 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 outpos = p-PyUnicode_AS_UNICODE(unicode);
1520 endinpos = s-starts;
1521 if (unicode_decode_call_errorhandler(
1522 errors, &errorHandler,
1523 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001524 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 (PyObject **)&unicode, &outpos, &p))
1526 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001527 }
1528
1529 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 outpos = p-PyUnicode_AS_UNICODE(unicode);
1531 endinpos = size;
1532 if (unicode_decode_call_errorhandler(
1533 errors, &errorHandler,
1534 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001535 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001537 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 if (s < e)
1539 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001540 }
1541
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001542 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 goto onError;
1544
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 Py_XDECREF(errorHandler);
1546 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 return (PyObject *)unicode;
1548
1549onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001550 Py_XDECREF(errorHandler);
1551 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001552 Py_DECREF(unicode);
1553 return NULL;
1554}
1555
1556
1557PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001558 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559 int encodeSetO,
1560 int encodeWhiteSpace,
1561 const char *errors)
1562{
1563 PyObject *v;
1564 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001565 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001566 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001567 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 unsigned int bitsleft = 0;
1569 unsigned long charsleft = 0;
1570 char * out;
1571 char * start;
1572
1573 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001574 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001575
Walter Dörwald51ab4142007-05-05 14:43:36 +00001576 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 if (v == NULL)
1578 return NULL;
1579
Walter Dörwald51ab4142007-05-05 14:43:36 +00001580 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 for (;i < size; ++i) {
1582 Py_UNICODE ch = s[i];
1583
1584 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001585 if (ch == '+') {
1586 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 *out++ = '-';
1588 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1589 charsleft = ch;
1590 bitsleft = 16;
1591 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001592 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001594 } else {
1595 *out++ = (char) ch;
1596 }
1597 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001598 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1599 *out++ = B64(charsleft << (6-bitsleft));
1600 charsleft = 0;
1601 bitsleft = 0;
1602 /* Characters not in the BASE64 set implicitly unshift the sequence
1603 so no '-' is required, except if the character is itself a '-' */
1604 if (B64CHAR(ch) || ch == '-') {
1605 *out++ = '-';
1606 }
1607 inShift = 0;
1608 *out++ = (char) ch;
1609 } else {
1610 bitsleft += 16;
1611 charsleft = (charsleft << 16) | ch;
1612 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1613
1614 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001615 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001616 or '-' then the shift sequence will be terminated implicitly and we
1617 don't have to insert a '-'. */
1618
1619 if (bitsleft == 0) {
1620 if (i + 1 < size) {
1621 Py_UNICODE ch2 = s[i+1];
1622
1623 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001624
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625 } else if (B64CHAR(ch2) || ch2 == '-') {
1626 *out++ = '-';
1627 inShift = 0;
1628 } else {
1629 inShift = 0;
1630 }
1631
1632 }
1633 else {
1634 *out++ = '-';
1635 inShift = 0;
1636 }
1637 }
Tim Petersced69f82003-09-16 20:30:58 +00001638 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001639 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001640 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 if (bitsleft) {
1642 *out++= B64(charsleft << (6-bitsleft) );
1643 *out++ = '-';
1644 }
1645
Walter Dörwald51ab4142007-05-05 14:43:36 +00001646 if (PyBytes_Resize(v, out - start)) {
1647 Py_DECREF(v);
1648 return NULL;
1649 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 return v;
1651}
1652
1653#undef SPECIAL
1654#undef B64
1655#undef B64CHAR
1656#undef UB64
1657#undef ENCODE
1658#undef DECODE
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660/* --- UTF-8 Codec -------------------------------------------------------- */
1661
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1683
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001685 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 const char *errors)
1687{
Walter Dörwald69652032004-09-07 20:24:22 +00001688 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1689}
1690
1691PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001692 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001693 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001694 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001695{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001696 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001698 Py_ssize_t startinpos;
1699 Py_ssize_t endinpos;
1700 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 const char *e;
1702 PyUnicodeObject *unicode;
1703 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001704 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001705 PyObject *errorHandler = NULL;
1706 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707
1708 /* Note: size will always be longer than the resulting Unicode
1709 character count */
1710 unicode = _PyUnicode_New(size);
1711 if (!unicode)
1712 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001713 if (size == 0) {
1714 if (consumed)
1715 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718
1719 /* Unpack UTF-8 encoded data */
1720 p = unicode->str;
1721 e = s + size;
1722
1723 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001724 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
1726 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001727 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 s++;
1729 continue;
1730 }
1731
1732 n = utf8_code_length[ch];
1733
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001734 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001735 if (consumed)
1736 break;
1737 else {
1738 errmsg = "unexpected end of data";
1739 startinpos = s-starts;
1740 endinpos = size;
1741 goto utf8Error;
1742 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744
1745 switch (n) {
1746
1747 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001748 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001749 startinpos = s-starts;
1750 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001751 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752
1753 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001755 startinpos = s-starts;
1756 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758
1759 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001760 if ((s[1] & 0xc0) != 0x80) {
1761 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001762 startinpos = s-starts;
1763 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001764 goto utf8Error;
1765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 startinpos = s-starts;
1769 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001770 errmsg = "illegal encoding";
1771 goto utf8Error;
1772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001774 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 break;
1776
1777 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001778 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001779 (s[2] & 0xc0) != 0x80) {
1780 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 startinpos = s-starts;
1782 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001783 goto utf8Error;
1784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001786 if (ch < 0x0800) {
1787 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001788 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001789
1790 XXX For wide builds (UCS-4) we should probably try
1791 to recombine the surrogates into a single code
1792 unit.
1793 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 startinpos = s-starts;
1796 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001797 goto utf8Error;
1798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001800 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001801 break;
1802
1803 case 4:
1804 if ((s[1] & 0xc0) != 0x80 ||
1805 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001806 (s[3] & 0xc0) != 0x80) {
1807 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 startinpos = s-starts;
1809 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001810 goto utf8Error;
1811 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001812 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1813 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1814 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001815 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001816 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001817 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001818 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001819 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001821 startinpos = s-starts;
1822 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001823 goto utf8Error;
1824 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001825#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001826 *p++ = (Py_UNICODE)ch;
1827#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001828 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001829
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001830 /* translate from 10000..10FFFF to 0..FFFF */
1831 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001832
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001833 /* high surrogate = top 10 bits added to D800 */
1834 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001835
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001836 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001837 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001838#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 break;
1840
1841 default:
1842 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001843 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 startinpos = s-starts;
1845 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001847 }
1848 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001849 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001850
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001851 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001852 outpos = p-PyUnicode_AS_UNICODE(unicode);
1853 if (unicode_decode_call_errorhandler(
1854 errors, &errorHandler,
1855 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001856 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 (PyObject **)&unicode, &outpos, &p))
1858 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 }
Walter Dörwald69652032004-09-07 20:24:22 +00001860 if (consumed)
1861 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
1863 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001864 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 goto onError;
1866
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 Py_XDECREF(errorHandler);
1868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 return (PyObject *)unicode;
1870
1871onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 Py_XDECREF(errorHandler);
1873 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 Py_DECREF(unicode);
1875 return NULL;
1876}
1877
Tim Peters602f7402002-04-27 18:03:26 +00001878/* Allocation strategy: if the string is short, convert into a stack buffer
1879 and allocate exactly as much space needed at the end. Else allocate the
1880 maximum possible needed (4 result bytes per Unicode character), and return
1881 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001882*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001883PyObject *
1884PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001885 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001886 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887{
Tim Peters602f7402002-04-27 18:03:26 +00001888#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001889
Martin v. Löwis18e16552006-02-15 17:27:45 +00001890 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001891 PyObject *v; /* result string object */
1892 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001893 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001894 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001895 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001896
Tim Peters602f7402002-04-27 18:03:26 +00001897 assert(s != NULL);
1898 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899
Tim Peters602f7402002-04-27 18:03:26 +00001900 if (size <= MAX_SHORT_UNICHARS) {
1901 /* Write into the stack buffer; nallocated can't overflow.
1902 * At the end, we'll allocate exactly as much heap space as it
1903 * turns out we need.
1904 */
1905 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1906 v = NULL; /* will allocate after we're done */
1907 p = stackbuf;
1908 }
1909 else {
1910 /* Overallocate on the heap, and give the excess back at the end. */
1911 nallocated = size * 4;
1912 if (nallocated / 4 != size) /* overflow! */
1913 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001914 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001915 if (v == NULL)
1916 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001917 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001918 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001919
Tim Peters602f7402002-04-27 18:03:26 +00001920 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001921 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001922
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001923 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001924 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001926
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001928 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001929 *p++ = (char)(0xc0 | (ch >> 6));
1930 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001931 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001932 else {
Tim Peters602f7402002-04-27 18:03:26 +00001933 /* Encode UCS2 Unicode ordinals */
1934 if (ch < 0x10000) {
1935 /* Special case: check for high surrogate */
1936 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1937 Py_UCS4 ch2 = s[i];
1938 /* Check for low surrogate and combine the two to
1939 form a UCS4 value */
1940 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001941 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001942 i++;
1943 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001944 }
Tim Peters602f7402002-04-27 18:03:26 +00001945 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001946 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001947 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001948 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1949 *p++ = (char)(0x80 | (ch & 0x3f));
1950 continue;
1951 }
1952encodeUCS4:
1953 /* Encode UCS4 Unicode ordinals */
1954 *p++ = (char)(0xf0 | (ch >> 18));
1955 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1956 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1957 *p++ = (char)(0x80 | (ch & 0x3f));
1958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001960
Tim Peters602f7402002-04-27 18:03:26 +00001961 if (v == NULL) {
1962 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001963 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001964 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001965 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001966 }
1967 else {
1968 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001969 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001970 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001971 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001974
Tim Peters602f7402002-04-27 18:03:26 +00001975#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976}
1977
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1979{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 if (!PyUnicode_Check(unicode)) {
1981 PyErr_BadArgument();
1982 return NULL;
1983 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001984 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1985 PyUnicode_GET_SIZE(unicode),
1986 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987}
1988
Walter Dörwald41980ca2007-08-16 21:55:45 +00001989/* --- UTF-32 Codec ------------------------------------------------------- */
1990
1991PyObject *
1992PyUnicode_DecodeUTF32(const char *s,
1993 Py_ssize_t size,
1994 const char *errors,
1995 int *byteorder)
1996{
1997 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1998}
1999
2000PyObject *
2001PyUnicode_DecodeUTF32Stateful(const char *s,
2002 Py_ssize_t size,
2003 const char *errors,
2004 int *byteorder,
2005 Py_ssize_t *consumed)
2006{
2007 const char *starts = s;
2008 Py_ssize_t startinpos;
2009 Py_ssize_t endinpos;
2010 Py_ssize_t outpos;
2011 PyUnicodeObject *unicode;
2012 Py_UNICODE *p;
2013#ifndef Py_UNICODE_WIDE
2014 int i, pairs;
2015#else
2016 const int pairs = 0;
2017#endif
2018 const unsigned char *q, *e;
2019 int bo = 0; /* assume native ordering by default */
2020 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002021 /* Offsets from q for retrieving bytes in the right order. */
2022#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2023 int iorder[] = {0, 1, 2, 3};
2024#else
2025 int iorder[] = {3, 2, 1, 0};
2026#endif
2027 PyObject *errorHandler = NULL;
2028 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002029 /* On narrow builds we split characters outside the BMP into two
2030 codepoints => count how much extra space we need. */
2031#ifndef Py_UNICODE_WIDE
2032 for (i = pairs = 0; i < size/4; i++)
2033 if (((Py_UCS4 *)s)[i] >= 0x10000)
2034 pairs++;
2035#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002036
2037 /* This might be one to much, because of a BOM */
2038 unicode = _PyUnicode_New((size+3)/4+pairs);
2039 if (!unicode)
2040 return NULL;
2041 if (size == 0)
2042 return (PyObject *)unicode;
2043
2044 /* Unpack UTF-32 encoded data */
2045 p = unicode->str;
2046 q = (unsigned char *)s;
2047 e = q + size;
2048
2049 if (byteorder)
2050 bo = *byteorder;
2051
2052 /* Check for BOM marks (U+FEFF) in the input and adjust current
2053 byte order setting accordingly. In native mode, the leading BOM
2054 mark is skipped, in all other modes, it is copied to the output
2055 stream as-is (giving a ZWNBSP character). */
2056 if (bo == 0) {
2057 if (size >= 4) {
2058 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2059 (q[iorder[1]] << 8) | q[iorder[0]];
2060#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2061 if (bom == 0x0000FEFF) {
2062 q += 4;
2063 bo = -1;
2064 }
2065 else if (bom == 0xFFFE0000) {
2066 q += 4;
2067 bo = 1;
2068 }
2069#else
2070 if (bom == 0x0000FEFF) {
2071 q += 4;
2072 bo = 1;
2073 }
2074 else if (bom == 0xFFFE0000) {
2075 q += 4;
2076 bo = -1;
2077 }
2078#endif
2079 }
2080 }
2081
2082 if (bo == -1) {
2083 /* force LE */
2084 iorder[0] = 0;
2085 iorder[1] = 1;
2086 iorder[2] = 2;
2087 iorder[3] = 3;
2088 }
2089 else if (bo == 1) {
2090 /* force BE */
2091 iorder[0] = 3;
2092 iorder[1] = 2;
2093 iorder[2] = 1;
2094 iorder[3] = 0;
2095 }
2096
2097 while (q < e) {
2098 Py_UCS4 ch;
2099 /* remaining bytes at the end? (size should be divisible by 4) */
2100 if (e-q<4) {
2101 if (consumed)
2102 break;
2103 errmsg = "truncated data";
2104 startinpos = ((const char *)q)-starts;
2105 endinpos = ((const char *)e)-starts;
2106 goto utf32Error;
2107 /* The remaining input chars are ignored if the callback
2108 chooses to skip the input */
2109 }
2110 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2111 (q[iorder[1]] << 8) | q[iorder[0]];
2112
2113 if (ch >= 0x110000)
2114 {
2115 errmsg = "codepoint not in range(0x110000)";
2116 startinpos = ((const char *)q)-starts;
2117 endinpos = startinpos+4;
2118 goto utf32Error;
2119 }
2120#ifndef Py_UNICODE_WIDE
2121 if (ch >= 0x10000)
2122 {
2123 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2124 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2125 }
2126 else
2127#endif
2128 *p++ = ch;
2129 q += 4;
2130 continue;
2131 utf32Error:
2132 outpos = p-PyUnicode_AS_UNICODE(unicode);
2133 if (unicode_decode_call_errorhandler(
2134 errors, &errorHandler,
2135 "utf32", errmsg,
2136 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2137 (PyObject **)&unicode, &outpos, &p))
2138 goto onError;
2139 }
2140
2141 if (byteorder)
2142 *byteorder = bo;
2143
2144 if (consumed)
2145 *consumed = (const char *)q-starts;
2146
2147 /* Adjust length */
2148 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2149 goto onError;
2150
2151 Py_XDECREF(errorHandler);
2152 Py_XDECREF(exc);
2153 return (PyObject *)unicode;
2154
2155onError:
2156 Py_DECREF(unicode);
2157 Py_XDECREF(errorHandler);
2158 Py_XDECREF(exc);
2159 return NULL;
2160}
2161
2162PyObject *
2163PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2164 Py_ssize_t size,
2165 const char *errors,
2166 int byteorder)
2167{
2168 PyObject *v;
2169 unsigned char *p;
2170#ifndef Py_UNICODE_WIDE
2171 int i, pairs;
2172#else
2173 const int pairs = 0;
2174#endif
2175 /* Offsets from p for storing byte pairs in the right order. */
2176#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2177 int iorder[] = {0, 1, 2, 3};
2178#else
2179 int iorder[] = {3, 2, 1, 0};
2180#endif
2181
2182#define STORECHAR(CH) \
2183 do { \
2184 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2185 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2186 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2187 p[iorder[0]] = (CH) & 0xff; \
2188 p += 4; \
2189 } while(0)
2190
2191 /* In narrow builds we can output surrogate pairs as one codepoint,
2192 so we need less space. */
2193#ifndef Py_UNICODE_WIDE
2194 for (i = pairs = 0; i < size-1; i++)
2195 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2196 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2197 pairs++;
2198#endif
2199 v = PyBytes_FromStringAndSize(NULL,
2200 4 * (size - pairs + (byteorder == 0)));
2201 if (v == NULL)
2202 return NULL;
2203
2204 p = (unsigned char *)PyBytes_AS_STRING(v);
2205 if (byteorder == 0)
2206 STORECHAR(0xFEFF);
2207 if (size == 0)
2208 return v;
2209
2210 if (byteorder == -1) {
2211 /* force LE */
2212 iorder[0] = 0;
2213 iorder[1] = 1;
2214 iorder[2] = 2;
2215 iorder[3] = 3;
2216 }
2217 else if (byteorder == 1) {
2218 /* force BE */
2219 iorder[0] = 3;
2220 iorder[1] = 2;
2221 iorder[2] = 1;
2222 iorder[3] = 0;
2223 }
2224
2225 while (size-- > 0) {
2226 Py_UCS4 ch = *s++;
2227#ifndef Py_UNICODE_WIDE
2228 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2229 Py_UCS4 ch2 = *s;
2230 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2231 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2232 s++;
2233 size--;
2234 }
2235 }
2236#endif
2237 STORECHAR(ch);
2238 }
2239 return v;
2240#undef STORECHAR
2241}
2242
2243PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2244{
2245 if (!PyUnicode_Check(unicode)) {
2246 PyErr_BadArgument();
2247 return NULL;
2248 }
2249 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2250 PyUnicode_GET_SIZE(unicode),
2251 NULL,
2252 0);
2253}
2254
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255/* --- UTF-16 Codec ------------------------------------------------------- */
2256
Tim Peters772747b2001-08-09 22:21:55 +00002257PyObject *
2258PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002259 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002260 const char *errors,
2261 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262{
Walter Dörwald69652032004-09-07 20:24:22 +00002263 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2264}
2265
2266PyObject *
2267PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002268 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002269 const char *errors,
2270 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002271 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002272{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002274 Py_ssize_t startinpos;
2275 Py_ssize_t endinpos;
2276 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 PyUnicodeObject *unicode;
2278 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002279 const unsigned char *q, *e;
2280 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002281 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002282 /* Offsets from q for retrieving byte pairs in the right order. */
2283#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2284 int ihi = 1, ilo = 0;
2285#else
2286 int ihi = 0, ilo = 1;
2287#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002288 PyObject *errorHandler = NULL;
2289 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290
2291 /* Note: size will always be longer than the resulting Unicode
2292 character count */
2293 unicode = _PyUnicode_New(size);
2294 if (!unicode)
2295 return NULL;
2296 if (size == 0)
2297 return (PyObject *)unicode;
2298
2299 /* Unpack UTF-16 encoded data */
2300 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002301 q = (unsigned char *)s;
2302 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303
2304 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002305 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002307 /* Check for BOM marks (U+FEFF) in the input and adjust current
2308 byte order setting accordingly. In native mode, the leading BOM
2309 mark is skipped, in all other modes, it is copied to the output
2310 stream as-is (giving a ZWNBSP character). */
2311 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002312 if (size >= 2) {
2313 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002314#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002315 if (bom == 0xFEFF) {
2316 q += 2;
2317 bo = -1;
2318 }
2319 else if (bom == 0xFFFE) {
2320 q += 2;
2321 bo = 1;
2322 }
Tim Petersced69f82003-09-16 20:30:58 +00002323#else
Walter Dörwald69652032004-09-07 20:24:22 +00002324 if (bom == 0xFEFF) {
2325 q += 2;
2326 bo = 1;
2327 }
2328 else if (bom == 0xFFFE) {
2329 q += 2;
2330 bo = -1;
2331 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002332#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002333 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335
Tim Peters772747b2001-08-09 22:21:55 +00002336 if (bo == -1) {
2337 /* force LE */
2338 ihi = 1;
2339 ilo = 0;
2340 }
2341 else if (bo == 1) {
2342 /* force BE */
2343 ihi = 0;
2344 ilo = 1;
2345 }
2346
2347 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002348 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002349 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002350 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002351 if (consumed)
2352 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 errmsg = "truncated data";
2354 startinpos = ((const char *)q)-starts;
2355 endinpos = ((const char *)e)-starts;
2356 goto utf16Error;
2357 /* The remaining input chars are ignored if the callback
2358 chooses to skip the input */
2359 }
2360 ch = (q[ihi] << 8) | q[ilo];
2361
Tim Peters772747b2001-08-09 22:21:55 +00002362 q += 2;
2363
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364 if (ch < 0xD800 || ch > 0xDFFF) {
2365 *p++ = ch;
2366 continue;
2367 }
2368
2369 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002370 if (q >= e) {
2371 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002372 startinpos = (((const char *)q)-2)-starts;
2373 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002374 goto utf16Error;
2375 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002376 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002377 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2378 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002379 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002380#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002381 *p++ = ch;
2382 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002383#else
2384 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002385#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002386 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002387 }
2388 else {
2389 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002390 startinpos = (((const char *)q)-4)-starts;
2391 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002392 goto utf16Error;
2393 }
2394
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002396 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002397 startinpos = (((const char *)q)-2)-starts;
2398 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002399 /* Fall through to report the error */
2400
2401 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002402 outpos = p-PyUnicode_AS_UNICODE(unicode);
2403 if (unicode_decode_call_errorhandler(
2404 errors, &errorHandler,
2405 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002406 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002407 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002408 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 }
2410
2411 if (byteorder)
2412 *byteorder = bo;
2413
Walter Dörwald69652032004-09-07 20:24:22 +00002414 if (consumed)
2415 *consumed = (const char *)q-starts;
2416
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002418 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419 goto onError;
2420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002421 Py_XDECREF(errorHandler);
2422 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 return (PyObject *)unicode;
2424
2425onError:
2426 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002427 Py_XDECREF(errorHandler);
2428 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 return NULL;
2430}
2431
Tim Peters772747b2001-08-09 22:21:55 +00002432PyObject *
2433PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002434 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002435 const char *errors,
2436 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437{
2438 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002439 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002440#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002441 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002442#else
2443 const int pairs = 0;
2444#endif
Tim Peters772747b2001-08-09 22:21:55 +00002445 /* Offsets from p for storing byte pairs in the right order. */
2446#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2447 int ihi = 1, ilo = 0;
2448#else
2449 int ihi = 0, ilo = 1;
2450#endif
2451
2452#define STORECHAR(CH) \
2453 do { \
2454 p[ihi] = ((CH) >> 8) & 0xff; \
2455 p[ilo] = (CH) & 0xff; \
2456 p += 2; \
2457 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002459#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002460 for (i = pairs = 0; i < size; i++)
2461 if (s[i] >= 0x10000)
2462 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002463#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002464 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002465 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 if (v == NULL)
2467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468
Walter Dörwald3cc34522007-05-04 10:48:27 +00002469 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002471 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002472 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002473 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002474
2475 if (byteorder == -1) {
2476 /* force LE */
2477 ihi = 1;
2478 ilo = 0;
2479 }
2480 else if (byteorder == 1) {
2481 /* force BE */
2482 ihi = 0;
2483 ilo = 1;
2484 }
2485
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002486 while (size-- > 0) {
2487 Py_UNICODE ch = *s++;
2488 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002489#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002490 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002491 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2492 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002494#endif
Tim Peters772747b2001-08-09 22:21:55 +00002495 STORECHAR(ch);
2496 if (ch2)
2497 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002500#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501}
2502
2503PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2504{
2505 if (!PyUnicode_Check(unicode)) {
2506 PyErr_BadArgument();
2507 return NULL;
2508 }
2509 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2510 PyUnicode_GET_SIZE(unicode),
2511 NULL,
2512 0);
2513}
2514
2515/* --- Unicode Escape Codec ----------------------------------------------- */
2516
Fredrik Lundh06d12682001-01-24 07:59:11 +00002517static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002518
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002520 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 const char *errors)
2522{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002523 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002524 Py_ssize_t startinpos;
2525 Py_ssize_t endinpos;
2526 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002527 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002531 char* message;
2532 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 PyObject *errorHandler = NULL;
2534 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002535
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 /* Escaped strings will always be longer than the resulting
2537 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 length after conversion to the true value.
2539 (but if the error callback returns a long replacement string
2540 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 v = _PyUnicode_New(size);
2542 if (v == NULL)
2543 goto onError;
2544 if (size == 0)
2545 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002547 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002549
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 while (s < end) {
2551 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002552 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554
2555 /* Non-escape characters are interpreted as Unicode ordinals */
2556 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002557 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 continue;
2559 }
2560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 /* \ - Escapes */
2563 s++;
2564 switch (*s++) {
2565
2566 /* \x escapes */
2567 case '\n': break;
2568 case '\\': *p++ = '\\'; break;
2569 case '\'': *p++ = '\''; break;
2570 case '\"': *p++ = '\"'; break;
2571 case 'b': *p++ = '\b'; break;
2572 case 'f': *p++ = '\014'; break; /* FF */
2573 case 't': *p++ = '\t'; break;
2574 case 'n': *p++ = '\n'; break;
2575 case 'r': *p++ = '\r'; break;
2576 case 'v': *p++ = '\013'; break; /* VT */
2577 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2578
2579 /* \OOO (octal) escapes */
2580 case '0': case '1': case '2': case '3':
2581 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002582 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002584 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002586 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002588 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 break;
2590
Fredrik Lundhccc74732001-02-18 22:13:49 +00002591 /* hex escapes */
2592 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002594 digits = 2;
2595 message = "truncated \\xXX escape";
2596 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597
Fredrik Lundhccc74732001-02-18 22:13:49 +00002598 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002600 digits = 4;
2601 message = "truncated \\uXXXX escape";
2602 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603
Fredrik Lundhccc74732001-02-18 22:13:49 +00002604 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002605 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002606 digits = 8;
2607 message = "truncated \\UXXXXXXXX escape";
2608 hexescape:
2609 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 outpos = p-PyUnicode_AS_UNICODE(v);
2611 if (s+digits>end) {
2612 endinpos = size;
2613 if (unicode_decode_call_errorhandler(
2614 errors, &errorHandler,
2615 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002616 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002617 (PyObject **)&v, &outpos, &p))
2618 goto onError;
2619 goto nextByte;
2620 }
2621 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002622 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002623 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 endinpos = (s+i+1)-starts;
2625 if (unicode_decode_call_errorhandler(
2626 errors, &errorHandler,
2627 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002628 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002630 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002632 }
2633 chr = (chr<<4) & ~0xF;
2634 if (c >= '0' && c <= '9')
2635 chr += c - '0';
2636 else if (c >= 'a' && c <= 'f')
2637 chr += 10 + c - 'a';
2638 else
2639 chr += 10 + c - 'A';
2640 }
2641 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002642 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002643 /* _decoding_error will have already written into the
2644 target buffer. */
2645 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002646 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002647 /* when we get here, chr is a 32-bit unicode character */
2648 if (chr <= 0xffff)
2649 /* UCS-2 character */
2650 *p++ = (Py_UNICODE) chr;
2651 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002652 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002653 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002654#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002655 *p++ = chr;
2656#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002657 chr -= 0x10000L;
2658 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002659 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002660#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002661 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002662 endinpos = s-starts;
2663 outpos = p-PyUnicode_AS_UNICODE(v);
2664 if (unicode_decode_call_errorhandler(
2665 errors, &errorHandler,
2666 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002667 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002669 goto onError;
2670 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002671 break;
2672
2673 /* \N{name} */
2674 case 'N':
2675 message = "malformed \\N character escape";
2676 if (ucnhash_CAPI == NULL) {
2677 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002678 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002679 m = PyImport_ImportModule("unicodedata");
2680 if (m == NULL)
2681 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002682 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002683 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002684 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002685 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002686 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002687 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002688 if (ucnhash_CAPI == NULL)
2689 goto ucnhashError;
2690 }
2691 if (*s == '{') {
2692 const char *start = s+1;
2693 /* look for the closing brace */
2694 while (*s != '}' && s < end)
2695 s++;
2696 if (s > start && s < end && *s == '}') {
2697 /* found a name. look it up in the unicode database */
2698 message = "unknown Unicode character name";
2699 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002700 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002701 goto store;
2702 }
2703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 endinpos = s-starts;
2705 outpos = p-PyUnicode_AS_UNICODE(v);
2706 if (unicode_decode_call_errorhandler(
2707 errors, &errorHandler,
2708 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002709 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002712 break;
2713
2714 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002715 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 message = "\\ at end of string";
2717 s--;
2718 endinpos = s-starts;
2719 outpos = p-PyUnicode_AS_UNICODE(v);
2720 if (unicode_decode_call_errorhandler(
2721 errors, &errorHandler,
2722 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002723 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002725 goto onError;
2726 }
2727 else {
2728 *p++ = '\\';
2729 *p++ = (unsigned char)s[-1];
2730 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002731 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 nextByte:
2734 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002736 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002738 Py_XDECREF(errorHandler);
2739 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002741
Fredrik Lundhccc74732001-02-18 22:13:49 +00002742ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002743 PyErr_SetString(
2744 PyExc_UnicodeError,
2745 "\\N escapes not supported (can't load unicodedata module)"
2746 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002747 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 Py_XDECREF(errorHandler);
2749 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002750 return NULL;
2751
Fredrik Lundhccc74732001-02-18 22:13:49 +00002752onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 Py_XDECREF(errorHandler);
2755 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 return NULL;
2757}
2758
/* Return a Unicode-Escape string version of the Unicode object.

   The encoder below, PyUnicode_EncodeUnicodeEscape(), takes no "quotes"
   argument and never adds surrounding quotes to its output.

*/
2765
Thomas Wouters477c8d52006-05-27 19:21:47 +00002766Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2767 Py_ssize_t size,
2768 Py_UNICODE ch)
2769{
2770 /* like wcschr, but doesn't stop at NULL characters */
2771
2772 while (size-- > 0) {
2773 if (*s == ch)
2774 return s;
2775 s++;
2776 }
2777
2778 return NULL;
2779}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002780
Walter Dörwald79e913e2007-05-12 11:08:06 +00002781static const char *hexdigits = "0123456789abcdef";
2782
2783PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2784 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785{
2786 PyObject *repr;
2787 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788
Thomas Wouters89f507f2006-12-13 04:49:30 +00002789 /* XXX(nnorwitz): rather than over-allocating, it would be
2790 better to choose a different scheme. Perhaps scan the
2791 first N-chars of the string and allocate based on that size.
2792 */
2793 /* Initial allocation is based on the longest-possible unichr
2794 escape.
2795
2796 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2797 unichr, so in this case it's the longest unichr escape. In
2798 narrow (UTF-16) builds this is five chars per source unichr
2799 since there are two unichrs in the surrogate pair, so in narrow
2800 (UTF-16) builds it's not the longest unichr escape.
2801
2802 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2803 so in the narrow (UTF-16) build case it's the longest unichr
2804 escape.
2805 */
2806
Walter Dörwald79e913e2007-05-12 11:08:06 +00002807 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002808#ifdef Py_UNICODE_WIDE
2809 + 10*size
2810#else
2811 + 6*size
2812#endif
2813 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 if (repr == NULL)
2815 return NULL;
2816
Walter Dörwald79e913e2007-05-12 11:08:06 +00002817 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 while (size-- > 0) {
2820 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002821
Walter Dörwald79e913e2007-05-12 11:08:06 +00002822 /* Escape backslashes */
2823 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 *p++ = '\\';
2825 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002826 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002827 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002828
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002829#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002830 /* Map 21-bit characters to '\U00xxxxxx' */
2831 else if (ch >= 0x10000) {
2832 *p++ = '\\';
2833 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002834 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2835 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2836 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2837 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2838 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2839 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2840 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2841 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002842 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002843 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002844#else
2845 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002846 else if (ch >= 0xD800 && ch < 0xDC00) {
2847 Py_UNICODE ch2;
2848 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002849
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002850 ch2 = *s++;
2851 size--;
2852 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2853 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2854 *p++ = '\\';
2855 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002856 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2857 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2858 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2859 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2860 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2861 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2862 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2863 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002864 continue;
2865 }
2866 /* Fall through: isolated surrogates are copied as-is */
2867 s--;
2868 size++;
2869 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002870#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002871
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002873 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 *p++ = '\\';
2875 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002876 *p++ = hexdigits[(ch >> 12) & 0x000F];
2877 *p++ = hexdigits[(ch >> 8) & 0x000F];
2878 *p++ = hexdigits[(ch >> 4) & 0x000F];
2879 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002881
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002882 /* Map special whitespace to '\t', \n', '\r' */
2883 else if (ch == '\t') {
2884 *p++ = '\\';
2885 *p++ = 't';
2886 }
2887 else if (ch == '\n') {
2888 *p++ = '\\';
2889 *p++ = 'n';
2890 }
2891 else if (ch == '\r') {
2892 *p++ = '\\';
2893 *p++ = 'r';
2894 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002895
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002896 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002897 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002899 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002900 *p++ = hexdigits[(ch >> 4) & 0x000F];
2901 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002902 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002903
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 /* Copy everything else as-is */
2905 else
2906 *p++ = (char) ch;
2907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908
2909 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002910 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2911 Py_DECREF(repr);
2912 return NULL;
2913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914 return repr;
2915}
2916
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2918{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002919 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 if (!PyUnicode_Check(unicode)) {
2921 PyErr_BadArgument();
2922 return NULL;
2923 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002924 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2925 PyUnicode_GET_SIZE(unicode));
2926
2927 if (!s)
2928 return NULL;
2929 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2930 PyBytes_GET_SIZE(s));
2931 Py_DECREF(s);
2932 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933}
2934
2935/* --- Raw Unicode Escape Codec ------------------------------------------- */
2936
2937PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002938 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 const char *errors)
2940{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002941 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002942 Py_ssize_t startinpos;
2943 Py_ssize_t endinpos;
2944 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002946 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 const char *end;
2948 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002949 PyObject *errorHandler = NULL;
2950 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002951
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 /* Escaped strings will always be longer than the resulting
2953 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002954 length after conversion to the true value. (But decoding error
2955 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 v = _PyUnicode_New(size);
2957 if (v == NULL)
2958 goto onError;
2959 if (size == 0)
2960 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962 end = s + size;
2963 while (s < end) {
2964 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002965 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002967 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
2969 /* Non-escape characters are interpreted as Unicode ordinals */
2970 if (*s != '\\') {
2971 *p++ = (unsigned char)*s++;
2972 continue;
2973 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002974 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975
2976 /* \u-escapes are only interpreted iff the number of leading
2977 backslashes if odd */
2978 bs = s;
2979 for (;s < end;) {
2980 if (*s != '\\')
2981 break;
2982 *p++ = (unsigned char)*s++;
2983 }
2984 if (((s - bs) & 1) == 0 ||
2985 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002986 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 continue;
2988 }
2989 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002990 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 s++;
2992
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002993 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002995 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998 endinpos = s-starts;
2999 if (unicode_decode_call_errorhandler(
3000 errors, &errorHandler,
3001 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003002 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 }
3007 x = (x<<4) & ~0xF;
3008 if (c >= '0' && c <= '9')
3009 x += c - '0';
3010 else if (c >= 'a' && c <= 'f')
3011 x += 10 + c - 'a';
3012 else
3013 x += 10 + c - 'A';
3014 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003015#ifndef Py_UNICODE_WIDE
3016 if (x > 0x10000) {
3017 if (unicode_decode_call_errorhandler(
3018 errors, &errorHandler,
3019 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003020 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003021 (PyObject **)&v, &outpos, &p))
3022 goto onError;
3023 }
3024#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 *p++ = x;
3026 nextByte:
3027 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003029 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003030 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 Py_XDECREF(errorHandler);
3032 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003034
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 onError:
3036 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 Py_XDECREF(errorHandler);
3038 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 return NULL;
3040}
3041
3042PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003043 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044{
3045 PyObject *repr;
3046 char *p;
3047 char *q;
3048
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003049#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003050 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003051#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003052 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003053#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 if (repr == NULL)
3055 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003056 if (size == 0)
3057 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058
Walter Dörwald711005d2007-05-12 12:03:26 +00003059 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 while (size-- > 0) {
3061 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003062#ifdef Py_UNICODE_WIDE
3063 /* Map 32-bit characters to '\Uxxxxxxxx' */
3064 if (ch >= 0x10000) {
3065 *p++ = '\\';
3066 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003067 *p++ = hexdigits[(ch >> 28) & 0xf];
3068 *p++ = hexdigits[(ch >> 24) & 0xf];
3069 *p++ = hexdigits[(ch >> 20) & 0xf];
3070 *p++ = hexdigits[(ch >> 16) & 0xf];
3071 *p++ = hexdigits[(ch >> 12) & 0xf];
3072 *p++ = hexdigits[(ch >> 8) & 0xf];
3073 *p++ = hexdigits[(ch >> 4) & 0xf];
3074 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003075 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003076 else
3077#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 /* Map 16-bit characters to '\uxxxx' */
3079 if (ch >= 256) {
3080 *p++ = '\\';
3081 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003082 *p++ = hexdigits[(ch >> 12) & 0xf];
3083 *p++ = hexdigits[(ch >> 8) & 0xf];
3084 *p++ = hexdigits[(ch >> 4) & 0xf];
3085 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 }
3087 /* Copy everything else as-is */
3088 else
3089 *p++ = (char) ch;
3090 }
3091 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003092 if (PyBytes_Resize(repr, p - q)) {
3093 Py_DECREF(repr);
3094 return NULL;
3095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 return repr;
3097}
3098
3099PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3100{
Walter Dörwald711005d2007-05-12 12:03:26 +00003101 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003103 PyErr_BadArgument();
3104 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003106 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3107 PyUnicode_GET_SIZE(unicode));
3108
3109 if (!s)
3110 return NULL;
3111 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3112 PyBytes_GET_SIZE(s));
3113 Py_DECREF(s);
3114 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115}
3116
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003117/* --- Unicode Internal Codec ------------------------------------------- */
3118
3119PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003120 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003121 const char *errors)
3122{
3123 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003124 Py_ssize_t startinpos;
3125 Py_ssize_t endinpos;
3126 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003127 PyUnicodeObject *v;
3128 Py_UNICODE *p;
3129 const char *end;
3130 const char *reason;
3131 PyObject *errorHandler = NULL;
3132 PyObject *exc = NULL;
3133
Neal Norwitzd43069c2006-01-08 01:12:10 +00003134#ifdef Py_UNICODE_WIDE
3135 Py_UNICODE unimax = PyUnicode_GetMax();
3136#endif
3137
Thomas Wouters89f507f2006-12-13 04:49:30 +00003138 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003139 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3140 if (v == NULL)
3141 goto onError;
3142 if (PyUnicode_GetSize((PyObject *)v) == 0)
3143 return (PyObject *)v;
3144 p = PyUnicode_AS_UNICODE(v);
3145 end = s + size;
3146
3147 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003148 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003149 /* We have to sanity check the raw data, otherwise doom looms for
3150 some malformed UCS-4 data. */
3151 if (
3152 #ifdef Py_UNICODE_WIDE
3153 *p > unimax || *p < 0 ||
3154 #endif
3155 end-s < Py_UNICODE_SIZE
3156 )
3157 {
3158 startinpos = s - starts;
3159 if (end-s < Py_UNICODE_SIZE) {
3160 endinpos = end-starts;
3161 reason = "truncated input";
3162 }
3163 else {
3164 endinpos = s - starts + Py_UNICODE_SIZE;
3165 reason = "illegal code point (> 0x10FFFF)";
3166 }
3167 outpos = p - PyUnicode_AS_UNICODE(v);
3168 if (unicode_decode_call_errorhandler(
3169 errors, &errorHandler,
3170 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003171 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003172 (PyObject **)&v, &outpos, &p)) {
3173 goto onError;
3174 }
3175 }
3176 else {
3177 p++;
3178 s += Py_UNICODE_SIZE;
3179 }
3180 }
3181
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003182 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003183 goto onError;
3184 Py_XDECREF(errorHandler);
3185 Py_XDECREF(exc);
3186 return (PyObject *)v;
3187
3188 onError:
3189 Py_XDECREF(v);
3190 Py_XDECREF(errorHandler);
3191 Py_XDECREF(exc);
3192 return NULL;
3193}
3194
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195/* --- Latin-1 Codec ------------------------------------------------------ */
3196
3197PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003198 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 const char *errors)
3200{
3201 PyUnicodeObject *v;
3202 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003203
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003205 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003206 Py_UNICODE r = *(unsigned char*)s;
3207 return PyUnicode_FromUnicode(&r, 1);
3208 }
3209
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 v = _PyUnicode_New(size);
3211 if (v == NULL)
3212 goto onError;
3213 if (size == 0)
3214 return (PyObject *)v;
3215 p = PyUnicode_AS_UNICODE(v);
3216 while (size-- > 0)
3217 *p++ = (unsigned char)*s++;
3218 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003219
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 onError:
3221 Py_XDECREF(v);
3222 return NULL;
3223}
3224
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003225/* create or adjust a UnicodeEncodeError */
3226static void make_encode_exception(PyObject **exceptionObject,
3227 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003228 const Py_UNICODE *unicode, Py_ssize_t size,
3229 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 if (*exceptionObject == NULL) {
3233 *exceptionObject = PyUnicodeEncodeError_Create(
3234 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 }
3236 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003237 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3238 goto onError;
3239 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3240 goto onError;
3241 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3242 goto onError;
3243 return;
3244 onError:
3245 Py_DECREF(*exceptionObject);
3246 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 }
3248}
3249
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250/* raises a UnicodeEncodeError */
3251static void raise_encode_exception(PyObject **exceptionObject,
3252 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003253 const Py_UNICODE *unicode, Py_ssize_t size,
3254 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 const char *reason)
3256{
3257 make_encode_exception(exceptionObject,
3258 encoding, unicode, size, startpos, endpos, reason);
3259 if (*exceptionObject != NULL)
3260 PyCodec_StrictErrors(*exceptionObject);
3261}
3262
3263/* error handling callback helper:
3264 build arguments, call the callback and check the arguments,
3265 put the result into newpos and return the replacement string, which
3266 has to be freed by the caller */
3267static PyObject *unicode_encode_call_errorhandler(const char *errors,
3268 PyObject **errorHandler,
3269 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003270 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3271 Py_ssize_t startpos, Py_ssize_t endpos,
3272 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003273{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003274 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275
3276 PyObject *restuple;
3277 PyObject *resunicode;
3278
3279 if (*errorHandler == NULL) {
3280 *errorHandler = PyCodec_LookupError(errors);
3281 if (*errorHandler == NULL)
3282 return NULL;
3283 }
3284
3285 make_encode_exception(exceptionObject,
3286 encoding, unicode, size, startpos, endpos, reason);
3287 if (*exceptionObject == NULL)
3288 return NULL;
3289
3290 restuple = PyObject_CallFunctionObjArgs(
3291 *errorHandler, *exceptionObject, NULL);
3292 if (restuple == NULL)
3293 return NULL;
3294 if (!PyTuple_Check(restuple)) {
3295 PyErr_Format(PyExc_TypeError, &argparse[4]);
3296 Py_DECREF(restuple);
3297 return NULL;
3298 }
3299 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3300 &resunicode, newpos)) {
3301 Py_DECREF(restuple);
3302 return NULL;
3303 }
3304 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003305 *newpos = size+*newpos;
3306 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003307 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003308 Py_DECREF(restuple);
3309 return NULL;
3310 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 Py_INCREF(resunicode);
3312 Py_DECREF(restuple);
3313 return resunicode;
3314}
3315
3316static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 const char *errors,
3319 int limit)
3320{
3321 /* output object */
3322 PyObject *res;
3323 /* pointers to the beginning and end+1 of input */
3324 const Py_UNICODE *startp = p;
3325 const Py_UNICODE *endp = p + size;
3326 /* pointer to the beginning of the unencodable characters */
3327 /* const Py_UNICODE *badp = NULL; */
3328 /* pointer into the output */
3329 char *str;
3330 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003331 Py_ssize_t respos = 0;
3332 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003333 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3334 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 PyObject *errorHandler = NULL;
3336 PyObject *exc = NULL;
3337 /* the following variable is used for caching string comparisons
3338 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3339 int known_errorHandler = -1;
3340
3341 /* allocate enough for a simple encoding without
3342 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003343 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003344 if (res == NULL)
3345 goto onError;
3346 if (size == 0)
3347 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003348 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 ressize = size;
3350
3351 while (p<endp) {
3352 Py_UNICODE c = *p;
3353
3354 /* can we encode this? */
3355 if (c<limit) {
3356 /* no overflow check, because we know that the space is enough */
3357 *str++ = (char)c;
3358 ++p;
3359 }
3360 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003361 Py_ssize_t unicodepos = p-startp;
3362 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003364 Py_ssize_t repsize;
3365 Py_ssize_t newpos;
3366 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367 Py_UNICODE *uni2;
3368 /* startpos for collecting unencodable chars */
3369 const Py_UNICODE *collstart = p;
3370 const Py_UNICODE *collend = p;
3371 /* find all unecodable characters */
3372 while ((collend < endp) && ((*collend)>=limit))
3373 ++collend;
3374 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3375 if (known_errorHandler==-1) {
3376 if ((errors==NULL) || (!strcmp(errors, "strict")))
3377 known_errorHandler = 1;
3378 else if (!strcmp(errors, "replace"))
3379 known_errorHandler = 2;
3380 else if (!strcmp(errors, "ignore"))
3381 known_errorHandler = 3;
3382 else if (!strcmp(errors, "xmlcharrefreplace"))
3383 known_errorHandler = 4;
3384 else
3385 known_errorHandler = 0;
3386 }
3387 switch (known_errorHandler) {
3388 case 1: /* strict */
3389 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3390 goto onError;
3391 case 2: /* replace */
3392 while (collstart++<collend)
3393 *str++ = '?'; /* fall through */
3394 case 3: /* ignore */
3395 p = collend;
3396 break;
3397 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003398 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003399 /* determine replacement size (temporarily (mis)uses p) */
3400 for (p = collstart, repsize = 0; p < collend; ++p) {
3401 if (*p<10)
3402 repsize += 2+1+1;
3403 else if (*p<100)
3404 repsize += 2+2+1;
3405 else if (*p<1000)
3406 repsize += 2+3+1;
3407 else if (*p<10000)
3408 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003409#ifndef Py_UNICODE_WIDE
3410 else
3411 repsize += 2+5+1;
3412#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 else if (*p<100000)
3414 repsize += 2+5+1;
3415 else if (*p<1000000)
3416 repsize += 2+6+1;
3417 else
3418 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003419#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003420 }
3421 requiredsize = respos+repsize+(endp-collend);
3422 if (requiredsize > ressize) {
3423 if (requiredsize<2*ressize)
3424 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003425 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003427 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 ressize = requiredsize;
3429 }
3430 /* generate replacement (temporarily (mis)uses p) */
3431 for (p = collstart; p < collend; ++p) {
3432 str += sprintf(str, "&#%d;", (int)*p);
3433 }
3434 p = collend;
3435 break;
3436 default:
3437 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3438 encoding, reason, startp, size, &exc,
3439 collstart-startp, collend-startp, &newpos);
3440 if (repunicode == NULL)
3441 goto onError;
3442 /* need more space? (at least enough for what we
3443 have+the replacement+the rest of the string, so
3444 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003445 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446 repsize = PyUnicode_GET_SIZE(repunicode);
3447 requiredsize = respos+repsize+(endp-collend);
3448 if (requiredsize > ressize) {
3449 if (requiredsize<2*ressize)
3450 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003451 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 Py_DECREF(repunicode);
3453 goto onError;
3454 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003455 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 ressize = requiredsize;
3457 }
3458 /* check if there is anything unencodable in the replacement
3459 and copy it to the output */
3460 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3461 c = *uni2;
3462 if (c >= limit) {
3463 raise_encode_exception(&exc, encoding, startp, size,
3464 unicodepos, unicodepos+1, reason);
3465 Py_DECREF(repunicode);
3466 goto onError;
3467 }
3468 *str = (char)c;
3469 }
3470 p = startp + newpos;
3471 Py_DECREF(repunicode);
3472 }
3473 }
3474 }
3475 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003476 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 if (respos<ressize)
3478 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003479 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 Py_XDECREF(errorHandler);
3481 Py_XDECREF(exc);
3482 return res;
3483
3484 onError:
3485 Py_XDECREF(res);
3486 Py_XDECREF(errorHandler);
3487 Py_XDECREF(exc);
3488 return NULL;
3489}
3490
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003492 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 const char *errors)
3494{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496}
3497
3498PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3499{
3500 if (!PyUnicode_Check(unicode)) {
3501 PyErr_BadArgument();
3502 return NULL;
3503 }
3504 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3505 PyUnicode_GET_SIZE(unicode),
3506 NULL);
3507}
3508
3509/* --- 7-bit ASCII Codec -------------------------------------------------- */
3510
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003512 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 const char *errors)
3514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003515 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 PyUnicodeObject *v;
3517 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003518 Py_ssize_t startinpos;
3519 Py_ssize_t endinpos;
3520 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 const char *e;
3522 PyObject *errorHandler = NULL;
3523 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003524
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003526 if (size == 1 && *(unsigned char*)s < 128) {
3527 Py_UNICODE r = *(unsigned char*)s;
3528 return PyUnicode_FromUnicode(&r, 1);
3529 }
Tim Petersced69f82003-09-16 20:30:58 +00003530
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 v = _PyUnicode_New(size);
3532 if (v == NULL)
3533 goto onError;
3534 if (size == 0)
3535 return (PyObject *)v;
3536 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 e = s + size;
3538 while (s < e) {
3539 register unsigned char c = (unsigned char)*s;
3540 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 ++s;
3543 }
3544 else {
3545 startinpos = s-starts;
3546 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003547 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 if (unicode_decode_call_errorhandler(
3549 errors, &errorHandler,
3550 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003551 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003556 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003557 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003558 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 Py_XDECREF(errorHandler);
3560 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003562
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 onError:
3564 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_XDECREF(errorHandler);
3566 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 return NULL;
3568}
3569
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003571 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 const char *errors)
3573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575}
3576
3577PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3578{
3579 if (!PyUnicode_Check(unicode)) {
3580 PyErr_BadArgument();
3581 return NULL;
3582 }
3583 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3584 PyUnicode_GET_SIZE(unicode),
3585 NULL);
3586}
3587
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003588#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003589
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003590/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003591
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003592#if SIZEOF_INT < SIZEOF_SSIZE_T
3593#define NEED_RETRY
3594#endif
3595
3596/* XXX This code is limited to "true" double-byte encodings, as
3597 a) it assumes an incomplete character consists of a single byte, and
3598 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3599 encodings, see IsDBCSLeadByteEx documentation. */
3600
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when CharPrev() reports
   no earlier character, when the previous character does not start with a
   lead byte, or when the previous character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3611
3612/*
3613 * Decode MBCS string into unicode object. If 'final' is set, converts
3614 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3615 */
3616static int decode_mbcs(PyUnicodeObject **v,
3617 const char *s, /* MBCS string */
3618 int size, /* sizeof MBCS string */
3619 int final)
3620{
3621 Py_UNICODE *p;
3622 Py_ssize_t n = 0;
3623 int usize = 0;
3624
3625 assert(size >= 0);
3626
3627 /* Skip trailing lead-byte unless 'final' is set */
3628 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3629 --size;
3630
3631 /* First get the size of the result */
3632 if (size > 0) {
3633 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3634 if (usize == 0) {
3635 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3636 return -1;
3637 }
3638 }
3639
3640 if (*v == NULL) {
3641 /* Create unicode object */
3642 *v = _PyUnicode_New(usize);
3643 if (*v == NULL)
3644 return -1;
3645 }
3646 else {
3647 /* Extend unicode object */
3648 n = PyUnicode_GET_SIZE(*v);
3649 if (_PyUnicode_Resize(v, n + usize) < 0)
3650 return -1;
3651 }
3652
3653 /* Do the conversion */
3654 if (size > 0) {
3655 p = PyUnicode_AS_UNICODE(*v) + n;
3656 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3657 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3658 return -1;
3659 }
3660 }
3661
3662 return size;
3663}
3664
3665PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3666 Py_ssize_t size,
3667 const char *errors,
3668 Py_ssize_t *consumed)
3669{
3670 PyUnicodeObject *v = NULL;
3671 int done;
3672
3673 if (consumed)
3674 *consumed = 0;
3675
3676#ifdef NEED_RETRY
3677 retry:
3678 if (size > INT_MAX)
3679 done = decode_mbcs(&v, s, INT_MAX, 0);
3680 else
3681#endif
3682 done = decode_mbcs(&v, s, (int)size, !consumed);
3683
3684 if (done < 0) {
3685 Py_XDECREF(v);
3686 return NULL;
3687 }
3688
3689 if (consumed)
3690 *consumed += done;
3691
3692#ifdef NEED_RETRY
3693 if (size > INT_MAX) {
3694 s += done;
3695 size -= done;
3696 goto retry;
3697 }
3698#endif
3699
3700 return (PyObject *)v;
3701}
3702
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003703PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003704 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003705 const char *errors)
3706{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003707 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3708}
3709
3710/*
3711 * Convert unicode into string object (MBCS).
3712 * Returns 0 if succeed, -1 otherwise.
3713 */
3714static int encode_mbcs(PyObject **repr,
3715 const Py_UNICODE *p, /* unicode */
3716 int size) /* size of unicode */
3717{
3718 int mbcssize = 0;
3719 Py_ssize_t n = 0;
3720
3721 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003722
3723 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003724 if (size > 0) {
3725 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3726 if (mbcssize == 0) {
3727 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3728 return -1;
3729 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003730 }
3731
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003732 if (*repr == NULL) {
3733 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003734 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003735 if (*repr == NULL)
3736 return -1;
3737 }
3738 else {
3739 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003740 n = PyBytes_Size(*repr);
3741 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003742 return -1;
3743 }
3744
3745 /* Do the conversion */
3746 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003747 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003748 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3749 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3750 return -1;
3751 }
3752 }
3753
3754 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003755}
3756
3757PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003758 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003759 const char *errors)
3760{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003761 PyObject *repr = NULL;
3762 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003763
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003764#ifdef NEED_RETRY
3765 retry:
3766 if (size > INT_MAX)
3767 ret = encode_mbcs(&repr, p, INT_MAX);
3768 else
3769#endif
3770 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003771
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003772 if (ret < 0) {
3773 Py_XDECREF(repr);
3774 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003775 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003776
3777#ifdef NEED_RETRY
3778 if (size > INT_MAX) {
3779 p += INT_MAX;
3780 size -= INT_MAX;
3781 goto retry;
3782 }
3783#endif
3784
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003785 return repr;
3786}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003787
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003788PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3789{
3790 if (!PyUnicode_Check(unicode)) {
3791 PyErr_BadArgument();
3792 return NULL;
3793 }
3794 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3795 PyUnicode_GET_SIZE(unicode),
3796 NULL);
3797}
3798
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003799#undef NEED_RETRY
3800
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003801#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003802
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803/* --- Character Mapping Codec -------------------------------------------- */
3804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003806 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807 PyObject *mapping,
3808 const char *errors)
3809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003811 Py_ssize_t startinpos;
3812 Py_ssize_t endinpos;
3813 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 PyUnicodeObject *v;
3816 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 PyObject *errorHandler = NULL;
3819 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003820 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003821 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 /* Default to Latin-1 */
3824 if (mapping == NULL)
3825 return PyUnicode_DecodeLatin1(s, size, errors);
3826
3827 v = _PyUnicode_New(size);
3828 if (v == NULL)
3829 goto onError;
3830 if (size == 0)
3831 return (PyObject *)v;
3832 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003834 if (PyUnicode_CheckExact(mapping)) {
3835 mapstring = PyUnicode_AS_UNICODE(mapping);
3836 maplen = PyUnicode_GET_SIZE(mapping);
3837 while (s < e) {
3838 unsigned char ch = *s;
3839 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003841 if (ch < maplen)
3842 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003844 if (x == 0xfffe) {
3845 /* undefined mapping */
3846 outpos = p-PyUnicode_AS_UNICODE(v);
3847 startinpos = s-starts;
3848 endinpos = startinpos+1;
3849 if (unicode_decode_call_errorhandler(
3850 errors, &errorHandler,
3851 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003852 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003853 (PyObject **)&v, &outpos, &p)) {
3854 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003855 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003856 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003857 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003858 *p++ = x;
3859 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003861 }
3862 else {
3863 while (s < e) {
3864 unsigned char ch = *s;
3865 PyObject *w, *x;
3866
3867 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3868 w = PyInt_FromLong((long)ch);
3869 if (w == NULL)
3870 goto onError;
3871 x = PyObject_GetItem(mapping, w);
3872 Py_DECREF(w);
3873 if (x == NULL) {
3874 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3875 /* No mapping found means: mapping is undefined. */
3876 PyErr_Clear();
3877 x = Py_None;
3878 Py_INCREF(x);
3879 } else
3880 goto onError;
3881 }
3882
3883 /* Apply mapping */
3884 if (PyInt_Check(x)) {
3885 long value = PyInt_AS_LONG(x);
3886 if (value < 0 || value > 65535) {
3887 PyErr_SetString(PyExc_TypeError,
3888 "character mapping must be in range(65536)");
3889 Py_DECREF(x);
3890 goto onError;
3891 }
3892 *p++ = (Py_UNICODE)value;
3893 }
3894 else if (x == Py_None) {
3895 /* undefined mapping */
3896 outpos = p-PyUnicode_AS_UNICODE(v);
3897 startinpos = s-starts;
3898 endinpos = startinpos+1;
3899 if (unicode_decode_call_errorhandler(
3900 errors, &errorHandler,
3901 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003902 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003903 (PyObject **)&v, &outpos, &p)) {
3904 Py_DECREF(x);
3905 goto onError;
3906 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003907 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003908 continue;
3909 }
3910 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003911 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003912
3913 if (targetsize == 1)
3914 /* 1-1 mapping */
3915 *p++ = *PyUnicode_AS_UNICODE(x);
3916
3917 else if (targetsize > 1) {
3918 /* 1-n mapping */
3919 if (targetsize > extrachars) {
3920 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003921 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3922 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003923 (targetsize << 2);
3924 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003925 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003926 if (_PyUnicode_Resize(&v,
3927 PyUnicode_GET_SIZE(v) + needed) < 0) {
3928 Py_DECREF(x);
3929 goto onError;
3930 }
3931 p = PyUnicode_AS_UNICODE(v) + oldpos;
3932 }
3933 Py_UNICODE_COPY(p,
3934 PyUnicode_AS_UNICODE(x),
3935 targetsize);
3936 p += targetsize;
3937 extrachars -= targetsize;
3938 }
3939 /* 1-0 mapping: skip the character */
3940 }
3941 else {
3942 /* wrong return value */
3943 PyErr_SetString(PyExc_TypeError,
3944 "character mapping must return integer, None or unicode");
3945 Py_DECREF(x);
3946 goto onError;
3947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003949 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 }
3952 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003953 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 Py_XDECREF(errorHandler);
3956 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003958
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 Py_XDECREF(errorHandler);
3961 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 Py_XDECREF(v);
3963 return NULL;
3964}
3965
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003966/* Charmap encoding: the lookup table */
3967
3968struct encoding_map{
3969 PyObject_HEAD
3970 unsigned char level1[32];
3971 int count2, count3;
3972 unsigned char level23[1];
3973};
3974
3975static PyObject*
3976encoding_map_size(PyObject *obj, PyObject* args)
3977{
3978 struct encoding_map *map = (struct encoding_map*)obj;
3979 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3980 128*map->count3);
3981}
3982
3983static PyMethodDef encoding_map_methods[] = {
3984 {"size", encoding_map_size, METH_NOARGS,
3985 PyDoc_STR("Return the size (in bytes) of this object") },
3986 { 0 }
3987};
3988
3989static void
3990encoding_map_dealloc(PyObject* o)
3991{
3992 PyObject_FREE(o);
3993}
3994
3995static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003996 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003997 "EncodingMap", /*tp_name*/
3998 sizeof(struct encoding_map), /*tp_basicsize*/
3999 0, /*tp_itemsize*/
4000 /* methods */
4001 encoding_map_dealloc, /*tp_dealloc*/
4002 0, /*tp_print*/
4003 0, /*tp_getattr*/
4004 0, /*tp_setattr*/
4005 0, /*tp_compare*/
4006 0, /*tp_repr*/
4007 0, /*tp_as_number*/
4008 0, /*tp_as_sequence*/
4009 0, /*tp_as_mapping*/
4010 0, /*tp_hash*/
4011 0, /*tp_call*/
4012 0, /*tp_str*/
4013 0, /*tp_getattro*/
4014 0, /*tp_setattro*/
4015 0, /*tp_as_buffer*/
4016 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4017 0, /*tp_doc*/
4018 0, /*tp_traverse*/
4019 0, /*tp_clear*/
4020 0, /*tp_richcompare*/
4021 0, /*tp_weaklistoffset*/
4022 0, /*tp_iter*/
4023 0, /*tp_iternext*/
4024 encoding_map_methods, /*tp_methods*/
4025 0, /*tp_members*/
4026 0, /*tp_getset*/
4027 0, /*tp_base*/
4028 0, /*tp_dict*/
4029 0, /*tp_descr_get*/
4030 0, /*tp_descr_set*/
4031 0, /*tp_dictoffset*/
4032 0, /*tp_init*/
4033 0, /*tp_alloc*/
4034 0, /*tp_new*/
4035 0, /*tp_free*/
4036 0, /*tp_is_gc*/
4037};
4038
4039PyObject*
4040PyUnicode_BuildEncodingMap(PyObject* string)
4041{
4042 Py_UNICODE *decode;
4043 PyObject *result;
4044 struct encoding_map *mresult;
4045 int i;
4046 int need_dict = 0;
4047 unsigned char level1[32];
4048 unsigned char level2[512];
4049 unsigned char *mlevel1, *mlevel2, *mlevel3;
4050 int count2 = 0, count3 = 0;
4051
4052 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4053 PyErr_BadArgument();
4054 return NULL;
4055 }
4056 decode = PyUnicode_AS_UNICODE(string);
4057 memset(level1, 0xFF, sizeof level1);
4058 memset(level2, 0xFF, sizeof level2);
4059
4060 /* If there isn't a one-to-one mapping of NULL to \0,
4061 or if there are non-BMP characters, we need to use
4062 a mapping dictionary. */
4063 if (decode[0] != 0)
4064 need_dict = 1;
4065 for (i = 1; i < 256; i++) {
4066 int l1, l2;
4067 if (decode[i] == 0
4068 #ifdef Py_UNICODE_WIDE
4069 || decode[i] > 0xFFFF
4070 #endif
4071 ) {
4072 need_dict = 1;
4073 break;
4074 }
4075 if (decode[i] == 0xFFFE)
4076 /* unmapped character */
4077 continue;
4078 l1 = decode[i] >> 11;
4079 l2 = decode[i] >> 7;
4080 if (level1[l1] == 0xFF)
4081 level1[l1] = count2++;
4082 if (level2[l2] == 0xFF)
4083 level2[l2] = count3++;
4084 }
4085
4086 if (count2 >= 0xFF || count3 >= 0xFF)
4087 need_dict = 1;
4088
4089 if (need_dict) {
4090 PyObject *result = PyDict_New();
4091 PyObject *key, *value;
4092 if (!result)
4093 return NULL;
4094 for (i = 0; i < 256; i++) {
4095 key = value = NULL;
4096 key = PyInt_FromLong(decode[i]);
4097 value = PyInt_FromLong(i);
4098 if (!key || !value)
4099 goto failed1;
4100 if (PyDict_SetItem(result, key, value) == -1)
4101 goto failed1;
4102 Py_DECREF(key);
4103 Py_DECREF(value);
4104 }
4105 return result;
4106 failed1:
4107 Py_XDECREF(key);
4108 Py_XDECREF(value);
4109 Py_DECREF(result);
4110 return NULL;
4111 }
4112
4113 /* Create a three-level trie */
4114 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4115 16*count2 + 128*count3 - 1);
4116 if (!result)
4117 return PyErr_NoMemory();
4118 PyObject_Init(result, &EncodingMapType);
4119 mresult = (struct encoding_map*)result;
4120 mresult->count2 = count2;
4121 mresult->count3 = count3;
4122 mlevel1 = mresult->level1;
4123 mlevel2 = mresult->level23;
4124 mlevel3 = mresult->level23 + 16*count2;
4125 memcpy(mlevel1, level1, 32);
4126 memset(mlevel2, 0xFF, 16*count2);
4127 memset(mlevel3, 0, 128*count3);
4128 count3 = 0;
4129 for (i = 1; i < 256; i++) {
4130 int o1, o2, o3, i2, i3;
4131 if (decode[i] == 0xFFFE)
4132 /* unmapped character */
4133 continue;
4134 o1 = decode[i]>>11;
4135 o2 = (decode[i]>>7) & 0xF;
4136 i2 = 16*mlevel1[o1] + o2;
4137 if (mlevel2[i2] == 0xFF)
4138 mlevel2[i2] = count3++;
4139 o3 = decode[i] & 0x7F;
4140 i3 = 128*mlevel2[i2] + o3;
4141 mlevel3[i3] = i;
4142 }
4143 return result;
4144}
4145
4146static int
4147encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4148{
4149 struct encoding_map *map = (struct encoding_map*)mapping;
4150 int l1 = c>>11;
4151 int l2 = (c>>7) & 0xF;
4152 int l3 = c & 0x7F;
4153 int i;
4154
4155#ifdef Py_UNICODE_WIDE
4156 if (c > 0xFFFF) {
4157 return -1;
4158 }
4159#endif
4160 if (c == 0)
4161 return 0;
4162 /* level 1*/
4163 i = map->level1[l1];
4164 if (i == 0xFF) {
4165 return -1;
4166 }
4167 /* level 2*/
4168 i = map->level23[16*i+l2];
4169 if (i == 0xFF) {
4170 return -1;
4171 }
4172 /* level 3 */
4173 i = map->level23[16*map->count2 + 128*i + l3];
4174 if (i == 0) {
4175 return -1;
4176 }
4177 return i;
4178}
4179
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180/* Lookup the character ch in the mapping. If the character
4181 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004182 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 PyObject *w = PyInt_FromLong((long)c);
4186 PyObject *x;
4187
4188 if (w == NULL)
4189 return NULL;
4190 x = PyObject_GetItem(mapping, w);
4191 Py_DECREF(w);
4192 if (x == NULL) {
4193 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4194 /* No mapping found means: mapping is undefined. */
4195 PyErr_Clear();
4196 x = Py_None;
4197 Py_INCREF(x);
4198 return x;
4199 } else
4200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004202 else if (x == Py_None)
4203 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 else if (PyInt_Check(x)) {
4205 long value = PyInt_AS_LONG(x);
4206 if (value < 0 || value > 255) {
4207 PyErr_SetString(PyExc_TypeError,
4208 "character mapping must be in range(256)");
4209 Py_DECREF(x);
4210 return NULL;
4211 }
4212 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 else if (PyString_Check(x))
4215 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004218 PyErr_Format(PyExc_TypeError,
4219 "character mapping must return integer, None or str8, not %.400s",
4220 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 Py_DECREF(x);
4222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 }
4224}
4225
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004226static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004227charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004228{
Walter Dörwald827b0552007-05-12 13:23:53 +00004229 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004230 /* exponentially overallocate to minimize reallocations */
4231 if (requiredsize < 2*outsize)
4232 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004233 if (PyBytes_Resize(outobj, requiredsize)) {
4234 Py_DECREF(outobj);
4235 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004236 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004237 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004238}
4239
/* Outcome of trying to encode one character with charmapencode_output. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004244 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 space is available. Return a new reference to the object that
4246 was put in the output buffer, or Py_None, if the mapping was undefined
4247 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004248 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004250charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004251 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004253 PyObject *rep;
4254 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004255 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004257 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004258 int res = encoding_map_lookup(c, mapping);
4259 Py_ssize_t requiredsize = *outpos+1;
4260 if (res == -1)
4261 return enc_FAILED;
4262 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004263 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004264 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004265 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004266 outstart[(*outpos)++] = (char)res;
4267 return enc_SUCCESS;
4268 }
4269
4270 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004272 return enc_EXCEPTION;
4273 else if (rep==Py_None) {
4274 Py_DECREF(rep);
4275 return enc_FAILED;
4276 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004278 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004279 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004280 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004282 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004284 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4286 }
4287 else {
4288 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004289 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4290 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004291 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004292 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004294 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004296 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 memcpy(outstart + *outpos, repchars, repsize);
4298 *outpos += repsize;
4299 }
4300 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004301 Py_DECREF(rep);
4302 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303}
4304
4305/* handle an error in PyUnicode_EncodeCharmap
4306 Return 0 on success, -1 on error */
4307static
4308int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004309 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004311 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004312 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313{
4314 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004315 Py_ssize_t repsize;
4316 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 Py_UNICODE *uni2;
4318 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004319 Py_ssize_t collstartpos = *inpos;
4320 Py_ssize_t collendpos = *inpos+1;
4321 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 char *encoding = "charmap";
4323 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004324 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326 /* find all unencodable characters */
4327 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004328 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004329 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004330 int res = encoding_map_lookup(p[collendpos], mapping);
4331 if (res != -1)
4332 break;
4333 ++collendpos;
4334 continue;
4335 }
4336
4337 rep = charmapencode_lookup(p[collendpos], mapping);
4338 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004340 else if (rep!=Py_None) {
4341 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 break;
4343 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004344 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 ++collendpos;
4346 }
4347 /* cache callback name lookup
4348 * (if not done yet, i.e. it's the first error) */
4349 if (*known_errorHandler==-1) {
4350 if ((errors==NULL) || (!strcmp(errors, "strict")))
4351 *known_errorHandler = 1;
4352 else if (!strcmp(errors, "replace"))
4353 *known_errorHandler = 2;
4354 else if (!strcmp(errors, "ignore"))
4355 *known_errorHandler = 3;
4356 else if (!strcmp(errors, "xmlcharrefreplace"))
4357 *known_errorHandler = 4;
4358 else
4359 *known_errorHandler = 0;
4360 }
4361 switch (*known_errorHandler) {
4362 case 1: /* strict */
4363 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4364 return -1;
4365 case 2: /* replace */
4366 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4367 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004368 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 return -1;
4370 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004371 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4373 return -1;
4374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375 }
4376 /* fall through */
4377 case 3: /* ignore */
4378 *inpos = collendpos;
4379 break;
4380 case 4: /* xmlcharrefreplace */
4381 /* generate replacement (temporarily (mis)uses p) */
4382 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4383 char buffer[2+29+1+1];
4384 char *cp;
4385 sprintf(buffer, "&#%d;", (int)p[collpos]);
4386 for (cp = buffer; *cp; ++cp) {
4387 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004388 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004390 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4392 return -1;
4393 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 }
4395 }
4396 *inpos = collendpos;
4397 break;
4398 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004399 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 encoding, reason, p, size, exceptionObject,
4401 collstartpos, collendpos, &newpos);
4402 if (repunicode == NULL)
4403 return -1;
4404 /* generate replacement */
4405 repsize = PyUnicode_GET_SIZE(repunicode);
4406 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4407 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004408 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 return -1;
4410 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004411 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4414 return -1;
4415 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 }
4417 *inpos = newpos;
4418 Py_DECREF(repunicode);
4419 }
4420 return 0;
4421}
4422
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004424 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 PyObject *mapping,
4426 const char *errors)
4427{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 /* output object */
4429 PyObject *res = NULL;
4430 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004431 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 PyObject *errorHandler = NULL;
4435 PyObject *exc = NULL;
4436 /* the following variable is used for caching string comparisons
4437 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4438 * 3=ignore, 4=xmlcharrefreplace */
4439 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440
4441 /* Default to Latin-1 */
4442 if (mapping == NULL)
4443 return PyUnicode_EncodeLatin1(p, size, errors);
4444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 /* allocate enough for a simple encoding without
4446 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004447 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 if (res == NULL)
4449 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004450 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 while (inpos<size) {
4454 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004455 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004456 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004458 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 if (charmap_encoding_error(p, size, &inpos, mapping,
4460 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004461 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004462 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004463 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 else
4467 /* done with this character => adjust input position */
4468 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004472 if (respos<PyBytes_GET_SIZE(res)) {
4473 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 goto onError;
4475 }
4476 Py_XDECREF(exc);
4477 Py_XDECREF(errorHandler);
4478 return res;
4479
4480 onError:
4481 Py_XDECREF(res);
4482 Py_XDECREF(exc);
4483 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 return NULL;
4485}
4486
4487PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4488 PyObject *mapping)
4489{
4490 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4491 PyErr_BadArgument();
4492 return NULL;
4493 }
4494 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4495 PyUnicode_GET_SIZE(unicode),
4496 mapping,
4497 NULL);
4498}
4499
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500/* create or adjust a UnicodeTranslateError */
4501static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 const Py_UNICODE *unicode, Py_ssize_t size,
4503 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 if (*exceptionObject == NULL) {
4507 *exceptionObject = PyUnicodeTranslateError_Create(
4508 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 }
4510 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4512 goto onError;
4513 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4514 goto onError;
4515 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4516 goto onError;
4517 return;
4518 onError:
4519 Py_DECREF(*exceptionObject);
4520 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 }
4522}
4523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524/* raises a UnicodeTranslateError */
4525static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004526 const Py_UNICODE *unicode, Py_ssize_t size,
4527 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 const char *reason)
4529{
4530 make_translate_exception(exceptionObject,
4531 unicode, size, startpos, endpos, reason);
4532 if (*exceptionObject != NULL)
4533 PyCodec_StrictErrors(*exceptionObject);
4534}
4535
4536/* error handling callback helper:
4537 build arguments, call the callback and check the arguments,
4538 put the result into newpos and return the replacement string, which
4539 has to be freed by the caller */
4540static PyObject *unicode_translate_call_errorhandler(const char *errors,
4541 PyObject **errorHandler,
4542 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004543 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4544 Py_ssize_t startpos, Py_ssize_t endpos,
4545 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004547 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004549 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 PyObject *restuple;
4551 PyObject *resunicode;
4552
4553 if (*errorHandler == NULL) {
4554 *errorHandler = PyCodec_LookupError(errors);
4555 if (*errorHandler == NULL)
4556 return NULL;
4557 }
4558
4559 make_translate_exception(exceptionObject,
4560 unicode, size, startpos, endpos, reason);
4561 if (*exceptionObject == NULL)
4562 return NULL;
4563
4564 restuple = PyObject_CallFunctionObjArgs(
4565 *errorHandler, *exceptionObject, NULL);
4566 if (restuple == NULL)
4567 return NULL;
4568 if (!PyTuple_Check(restuple)) {
4569 PyErr_Format(PyExc_TypeError, &argparse[4]);
4570 Py_DECREF(restuple);
4571 return NULL;
4572 }
4573 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004574 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 Py_DECREF(restuple);
4576 return NULL;
4577 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004578 if (i_newpos<0)
4579 *newpos = size+i_newpos;
4580 else
4581 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004582 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004583 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004584 Py_DECREF(restuple);
4585 return NULL;
4586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 Py_INCREF(resunicode);
4588 Py_DECREF(restuple);
4589 return resunicode;
4590}
4591
4592/* Lookup the character ch in the mapping and put the result in result,
4593 which must be decrefed by the caller.
4594 Return 0 on success, -1 on error */
4595static
4596int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4597{
4598 PyObject *w = PyInt_FromLong((long)c);
4599 PyObject *x;
4600
4601 if (w == NULL)
4602 return -1;
4603 x = PyObject_GetItem(mapping, w);
4604 Py_DECREF(w);
4605 if (x == NULL) {
4606 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4607 /* No mapping found means: use 1:1 mapping. */
4608 PyErr_Clear();
4609 *result = NULL;
4610 return 0;
4611 } else
4612 return -1;
4613 }
4614 else if (x == Py_None) {
4615 *result = x;
4616 return 0;
4617 }
4618 else if (PyInt_Check(x)) {
4619 long value = PyInt_AS_LONG(x);
4620 long max = PyUnicode_GetMax();
4621 if (value < 0 || value > max) {
4622 PyErr_Format(PyExc_TypeError,
4623 "character mapping must be in range(0x%lx)", max+1);
4624 Py_DECREF(x);
4625 return -1;
4626 }
4627 *result = x;
4628 return 0;
4629 }
4630 else if (PyUnicode_Check(x)) {
4631 *result = x;
4632 return 0;
4633 }
4634 else {
4635 /* wrong return value */
4636 PyErr_SetString(PyExc_TypeError,
4637 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004638 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 return -1;
4640 }
4641}
4642/* ensure that *outobj is at least requiredsize characters long,
4643if not reallocate and adjust various state variables.
4644Return 0 on success, -1 on error */
4645static
Walter Dörwald4894c302003-10-24 14:25:28 +00004646int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004649 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004650 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004652 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004654 if (requiredsize < 2 * oldsize)
4655 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004656 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 return -1;
4658 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 }
4660 return 0;
4661}
4662/* lookup the character, put the result in the output string and adjust
4663 various state variables. Return a new reference to the object that
4664 was put in the output buffer in *result, or Py_None, if the mapping was
4665 undefined (in which case no character was written).
4666 The called must decref result.
4667 Return 0 on success, -1 on error. */
4668static
Walter Dörwald4894c302003-10-24 14:25:28 +00004669int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004670 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004671 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672{
Walter Dörwald4894c302003-10-24 14:25:28 +00004673 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674 return -1;
4675 if (*res==NULL) {
4676 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004677 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 }
4679 else if (*res==Py_None)
4680 ;
4681 else if (PyInt_Check(*res)) {
4682 /* no overflow check, because we know that the space is enough */
4683 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4684 }
4685 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004686 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 if (repsize==1) {
4688 /* no overflow check, because we know that the space is enough */
4689 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4690 }
4691 else if (repsize!=0) {
4692 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004693 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004694 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004695 repsize - 1;
4696 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 return -1;
4698 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4699 *outp += repsize;
4700 }
4701 }
4702 else
4703 return -1;
4704 return 0;
4705}
4706
4707PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004708 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 PyObject *mapping,
4710 const char *errors)
4711{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 /* output object */
4713 PyObject *res = NULL;
4714 /* pointers to the beginning and end+1 of input */
4715 const Py_UNICODE *startp = p;
4716 const Py_UNICODE *endp = p + size;
4717 /* pointer into the output */
4718 Py_UNICODE *str;
4719 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004720 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 char *reason = "character maps to <undefined>";
4722 PyObject *errorHandler = NULL;
4723 PyObject *exc = NULL;
4724 /* the following variable is used for caching string comparisons
4725 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4726 * 3=ignore, 4=xmlcharrefreplace */
4727 int known_errorHandler = -1;
4728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 if (mapping == NULL) {
4730 PyErr_BadArgument();
4731 return NULL;
4732 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733
4734 /* allocate enough for a simple 1:1 translation without
4735 replacements, if we need more, we'll resize */
4736 res = PyUnicode_FromUnicode(NULL, size);
4737 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004738 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 return res;
4741 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 while (p<endp) {
4744 /* try to encode it */
4745 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004746 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 goto onError;
4749 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004750 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 if (x!=Py_None) /* it worked => adjust input pointer */
4752 ++p;
4753 else { /* untranslatable character */
4754 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004755 Py_ssize_t repsize;
4756 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 Py_UNICODE *uni2;
4758 /* startpos for collecting untranslatable chars */
4759 const Py_UNICODE *collstart = p;
4760 const Py_UNICODE *collend = p+1;
4761 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 /* find all untranslatable characters */
4764 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004765 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 goto onError;
4767 Py_XDECREF(x);
4768 if (x!=Py_None)
4769 break;
4770 ++collend;
4771 }
4772 /* cache callback name lookup
4773 * (if not done yet, i.e. it's the first error) */
4774 if (known_errorHandler==-1) {
4775 if ((errors==NULL) || (!strcmp(errors, "strict")))
4776 known_errorHandler = 1;
4777 else if (!strcmp(errors, "replace"))
4778 known_errorHandler = 2;
4779 else if (!strcmp(errors, "ignore"))
4780 known_errorHandler = 3;
4781 else if (!strcmp(errors, "xmlcharrefreplace"))
4782 known_errorHandler = 4;
4783 else
4784 known_errorHandler = 0;
4785 }
4786 switch (known_errorHandler) {
4787 case 1: /* strict */
4788 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4789 goto onError;
4790 case 2: /* replace */
4791 /* No need to check for space, this is a 1:1 replacement */
4792 for (coll = collstart; coll<collend; ++coll)
4793 *str++ = '?';
4794 /* fall through */
4795 case 3: /* ignore */
4796 p = collend;
4797 break;
4798 case 4: /* xmlcharrefreplace */
4799 /* generate replacement (temporarily (mis)uses p) */
4800 for (p = collstart; p < collend; ++p) {
4801 char buffer[2+29+1+1];
4802 char *cp;
4803 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004804 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4806 goto onError;
4807 for (cp = buffer; *cp; ++cp)
4808 *str++ = *cp;
4809 }
4810 p = collend;
4811 break;
4812 default:
4813 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4814 reason, startp, size, &exc,
4815 collstart-startp, collend-startp, &newpos);
4816 if (repunicode == NULL)
4817 goto onError;
4818 /* generate replacement */
4819 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004820 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4822 Py_DECREF(repunicode);
4823 goto onError;
4824 }
4825 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4826 *str++ = *uni2;
4827 p = startp + newpos;
4828 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
4830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 /* Resize if we allocated to much */
4833 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004834 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004835 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004836 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 }
4838 Py_XDECREF(exc);
4839 Py_XDECREF(errorHandler);
4840 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 onError:
4843 Py_XDECREF(res);
4844 Py_XDECREF(exc);
4845 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 return NULL;
4847}
4848
4849PyObject *PyUnicode_Translate(PyObject *str,
4850 PyObject *mapping,
4851 const char *errors)
4852{
4853 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004854
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 str = PyUnicode_FromObject(str);
4856 if (str == NULL)
4857 goto onError;
4858 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4859 PyUnicode_GET_SIZE(str),
4860 mapping,
4861 errors);
4862 Py_DECREF(str);
4863 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004864
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 onError:
4866 Py_XDECREF(str);
4867 return NULL;
4868}
Tim Petersced69f82003-09-16 20:30:58 +00004869
Guido van Rossum9e896b32000-04-05 20:11:21 +00004870/* --- Decimal Encoder ---------------------------------------------------- */
4871
4872int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004874 char *output,
4875 const char *errors)
4876{
4877 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 PyObject *errorHandler = NULL;
4879 PyObject *exc = NULL;
4880 const char *encoding = "decimal";
4881 const char *reason = "invalid decimal Unicode string";
4882 /* the following variable is used for caching string comparisons
4883 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4884 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004885
4886 if (output == NULL) {
4887 PyErr_BadArgument();
4888 return -1;
4889 }
4890
4891 p = s;
4892 end = s + length;
4893 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004895 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004897 Py_ssize_t repsize;
4898 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 Py_UNICODE *uni2;
4900 Py_UNICODE *collstart;
4901 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004902
Guido van Rossum9e896b32000-04-05 20:11:21 +00004903 if (Py_UNICODE_ISSPACE(ch)) {
4904 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004906 continue;
4907 }
4908 decimal = Py_UNICODE_TODECIMAL(ch);
4909 if (decimal >= 0) {
4910 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004912 continue;
4913 }
Guido van Rossumba477042000-04-06 18:18:10 +00004914 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004915 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004917 continue;
4918 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919 /* All other characters are considered unencodable */
4920 collstart = p;
4921 collend = p+1;
4922 while (collend < end) {
4923 if ((0 < *collend && *collend < 256) ||
4924 !Py_UNICODE_ISSPACE(*collend) ||
4925 Py_UNICODE_TODECIMAL(*collend))
4926 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004927 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 /* cache callback name lookup
4929 * (if not done yet, i.e. it's the first error) */
4930 if (known_errorHandler==-1) {
4931 if ((errors==NULL) || (!strcmp(errors, "strict")))
4932 known_errorHandler = 1;
4933 else if (!strcmp(errors, "replace"))
4934 known_errorHandler = 2;
4935 else if (!strcmp(errors, "ignore"))
4936 known_errorHandler = 3;
4937 else if (!strcmp(errors, "xmlcharrefreplace"))
4938 known_errorHandler = 4;
4939 else
4940 known_errorHandler = 0;
4941 }
4942 switch (known_errorHandler) {
4943 case 1: /* strict */
4944 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4945 goto onError;
4946 case 2: /* replace */
4947 for (p = collstart; p < collend; ++p)
4948 *output++ = '?';
4949 /* fall through */
4950 case 3: /* ignore */
4951 p = collend;
4952 break;
4953 case 4: /* xmlcharrefreplace */
4954 /* generate replacement (temporarily (mis)uses p) */
4955 for (p = collstart; p < collend; ++p)
4956 output += sprintf(output, "&#%d;", (int)*p);
4957 p = collend;
4958 break;
4959 default:
4960 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4961 encoding, reason, s, length, &exc,
4962 collstart-s, collend-s, &newpos);
4963 if (repunicode == NULL)
4964 goto onError;
4965 /* generate replacement */
4966 repsize = PyUnicode_GET_SIZE(repunicode);
4967 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4968 Py_UNICODE ch = *uni2;
4969 if (Py_UNICODE_ISSPACE(ch))
4970 *output++ = ' ';
4971 else {
4972 decimal = Py_UNICODE_TODECIMAL(ch);
4973 if (decimal >= 0)
4974 *output++ = '0' + decimal;
4975 else if (0 < ch && ch < 256)
4976 *output++ = (char)ch;
4977 else {
4978 Py_DECREF(repunicode);
4979 raise_encode_exception(&exc, encoding,
4980 s, length, collstart-s, collend-s, reason);
4981 goto onError;
4982 }
4983 }
4984 }
4985 p = s + newpos;
4986 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004987 }
4988 }
4989 /* 0-terminate the output string */
4990 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 Py_XDECREF(exc);
4992 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004993 return 0;
4994
4995 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 Py_XDECREF(exc);
4997 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 return -1;
4999}
5000
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001/* --- Helpers ------------------------------------------------------------ */
5002
Eric Smith8c663262007-08-25 02:26:07 +00005003#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005004
5005#include "stringlib/fastsearch.h"
5006
5007#include "stringlib/count.h"
5008#include "stringlib/find.h"
5009#include "stringlib/partition.h"
5010
/* Normalize the slice bounds held in the variables `start` and `end` of
   the enclosing scope against (obj)->length: a negative bound has the
   length added to it and is then clamped at 0, and `end` is capped at the
   length.  Expands to several statements, so use it only as a statement of
   its own. */
#define FIX_START_END(obj)              \
    if (start < 0) {                    \
        start += (obj)->length;         \
        if (start < 0)                  \
            start = 0;                  \
    }                                   \
    if (end > (obj)->length)            \
        end = (obj)->length;            \
    if (end < 0) {                      \
        end += (obj)->length;           \
        if (end < 0)                    \
            end = 0;                    \
    }
5023
Martin v. Löwis18e16552006-02-15 17:27:45 +00005024Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005025 PyObject *substr,
5026 Py_ssize_t start,
5027 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005029 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005030 PyUnicodeObject* str_obj;
5031 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005032
Thomas Wouters477c8d52006-05-27 19:21:47 +00005033 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5034 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005036 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5037 if (!sub_obj) {
5038 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 return -1;
5040 }
Tim Petersced69f82003-09-16 20:30:58 +00005041
Thomas Wouters477c8d52006-05-27 19:21:47 +00005042 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005043
Thomas Wouters477c8d52006-05-27 19:21:47 +00005044 result = stringlib_count(
5045 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5046 );
5047
5048 Py_DECREF(sub_obj);
5049 Py_DECREF(str_obj);
5050
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051 return result;
5052}
5053
Martin v. Löwis18e16552006-02-15 17:27:45 +00005054Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005055 PyObject *sub,
5056 Py_ssize_t start,
5057 Py_ssize_t end,
5058 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005060 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005061
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005063 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005064 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005065 sub = PyUnicode_FromObject(sub);
5066 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005067 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005068 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069 }
Tim Petersced69f82003-09-16 20:30:58 +00005070
Thomas Wouters477c8d52006-05-27 19:21:47 +00005071 if (direction > 0)
5072 result = stringlib_find_slice(
5073 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5074 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5075 start, end
5076 );
5077 else
5078 result = stringlib_rfind_slice(
5079 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5080 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5081 start, end
5082 );
5083
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005085 Py_DECREF(sub);
5086
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087 return result;
5088}
5089
Tim Petersced69f82003-09-16 20:30:58 +00005090static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091int tailmatch(PyUnicodeObject *self,
5092 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005093 Py_ssize_t start,
5094 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 int direction)
5096{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 if (substring->length == 0)
5098 return 1;
5099
Thomas Wouters477c8d52006-05-27 19:21:47 +00005100 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101
5102 end -= substring->length;
5103 if (end < start)
5104 return 0;
5105
5106 if (direction > 0) {
5107 if (Py_UNICODE_MATCH(self, end, substring))
5108 return 1;
5109 } else {
5110 if (Py_UNICODE_MATCH(self, start, substring))
5111 return 1;
5112 }
5113
5114 return 0;
5115}
5116
Martin v. Löwis18e16552006-02-15 17:27:45 +00005117Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005119 Py_ssize_t start,
5120 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 int direction)
5122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005123 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005124
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 str = PyUnicode_FromObject(str);
5126 if (str == NULL)
5127 return -1;
5128 substr = PyUnicode_FromObject(substr);
5129 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005130 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 return -1;
5132 }
Tim Petersced69f82003-09-16 20:30:58 +00005133
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 result = tailmatch((PyUnicodeObject *)str,
5135 (PyUnicodeObject *)substr,
5136 start, end, direction);
5137 Py_DECREF(str);
5138 Py_DECREF(substr);
5139 return result;
5140}
5141
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142/* Apply fixfct filter to the Unicode object self and return a
5143 reference to the modified object */
5144
Tim Petersced69f82003-09-16 20:30:58 +00005145static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146PyObject *fixup(PyUnicodeObject *self,
5147 int (*fixfct)(PyUnicodeObject *s))
5148{
5149
5150 PyUnicodeObject *u;
5151
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005152 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 if (u == NULL)
5154 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005155
5156 Py_UNICODE_COPY(u->str, self->str, self->length);
5157
Tim Peters7a29bd52001-09-12 03:03:31 +00005158 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 /* fixfct should return TRUE if it modified the buffer. If
5160 FALSE, return a reference to the original buffer instead
5161 (to save space, not time) */
5162 Py_INCREF(self);
5163 Py_DECREF(u);
5164 return (PyObject*) self;
5165 }
5166 return (PyObject*) u;
5167}
5168
Tim Petersced69f82003-09-16 20:30:58 +00005169static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170int fixupper(PyUnicodeObject *self)
5171{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005172 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 Py_UNICODE *s = self->str;
5174 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 while (len-- > 0) {
5177 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 ch = Py_UNICODE_TOUPPER(*s);
5180 if (ch != *s) {
5181 status = 1;
5182 *s = ch;
5183 }
5184 s++;
5185 }
5186
5187 return status;
5188}
5189
Tim Petersced69f82003-09-16 20:30:58 +00005190static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191int fixlower(PyUnicodeObject *self)
5192{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 Py_UNICODE *s = self->str;
5195 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005196
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 while (len-- > 0) {
5198 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005199
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 ch = Py_UNICODE_TOLOWER(*s);
5201 if (ch != *s) {
5202 status = 1;
5203 *s = ch;
5204 }
5205 s++;
5206 }
5207
5208 return status;
5209}
5210
Tim Petersced69f82003-09-16 20:30:58 +00005211static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212int fixswapcase(PyUnicodeObject *self)
5213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005214 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 Py_UNICODE *s = self->str;
5216 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005217
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 while (len-- > 0) {
5219 if (Py_UNICODE_ISUPPER(*s)) {
5220 *s = Py_UNICODE_TOLOWER(*s);
5221 status = 1;
5222 } else if (Py_UNICODE_ISLOWER(*s)) {
5223 *s = Py_UNICODE_TOUPPER(*s);
5224 status = 1;
5225 }
5226 s++;
5227 }
5228
5229 return status;
5230}
5231
Tim Petersced69f82003-09-16 20:30:58 +00005232static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233int fixcapitalize(PyUnicodeObject *self)
5234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005236 Py_UNICODE *s = self->str;
5237 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005238
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005239 if (len == 0)
5240 return 0;
5241 if (Py_UNICODE_ISLOWER(*s)) {
5242 *s = Py_UNICODE_TOUPPER(*s);
5243 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005245 s++;
5246 while (--len > 0) {
5247 if (Py_UNICODE_ISUPPER(*s)) {
5248 *s = Py_UNICODE_TOLOWER(*s);
5249 status = 1;
5250 }
5251 s++;
5252 }
5253 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254}
5255
5256static
5257int fixtitle(PyUnicodeObject *self)
5258{
5259 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5260 register Py_UNICODE *e;
5261 int previous_is_cased;
5262
5263 /* Shortcut for single character strings */
5264 if (PyUnicode_GET_SIZE(self) == 1) {
5265 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5266 if (*p != ch) {
5267 *p = ch;
5268 return 1;
5269 }
5270 else
5271 return 0;
5272 }
Tim Petersced69f82003-09-16 20:30:58 +00005273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 e = p + PyUnicode_GET_SIZE(self);
5275 previous_is_cased = 0;
5276 for (; p < e; p++) {
5277 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005278
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 if (previous_is_cased)
5280 *p = Py_UNICODE_TOLOWER(ch);
5281 else
5282 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005283
5284 if (Py_UNICODE_ISLOWER(ch) ||
5285 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 Py_UNICODE_ISTITLE(ch))
5287 previous_is_cased = 1;
5288 else
5289 previous_is_cased = 0;
5290 }
5291 return 1;
5292}
5293
Tim Peters8ce9f162004-08-27 01:49:32 +00005294PyObject *
5295PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296{
Tim Peters8ce9f162004-08-27 01:49:32 +00005297 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005298 const Py_UNICODE blank = ' ';
5299 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005300 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005301 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005302 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5303 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005304 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5305 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005307 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005308 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309
Tim Peters05eba1f2004-08-27 21:32:02 +00005310 fseq = PySequence_Fast(seq, "");
5311 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005312 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005313 }
5314
Tim Peters91879ab2004-08-27 22:35:44 +00005315 /* Grrrr. A codec may be invoked to convert str objects to
5316 * Unicode, and so it's possible to call back into Python code
5317 * during PyUnicode_FromObject(), and so it's possible for a sick
5318 * codec to change the size of fseq (if seq is a list). Therefore
5319 * we have to keep refetching the size -- can't assume seqlen
5320 * is invariant.
5321 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005322 seqlen = PySequence_Fast_GET_SIZE(fseq);
5323 /* If empty sequence, return u"". */
5324 if (seqlen == 0) {
5325 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5326 goto Done;
5327 }
5328 /* If singleton sequence with an exact Unicode, return that. */
5329 if (seqlen == 1) {
5330 item = PySequence_Fast_GET_ITEM(fseq, 0);
5331 if (PyUnicode_CheckExact(item)) {
5332 Py_INCREF(item);
5333 res = (PyUnicodeObject *)item;
5334 goto Done;
5335 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005336 }
5337
Tim Peters05eba1f2004-08-27 21:32:02 +00005338 /* At least two items to join, or one that isn't exact Unicode. */
5339 if (seqlen > 1) {
5340 /* Set up sep and seplen -- they're needed. */
5341 if (separator == NULL) {
5342 sep = &blank;
5343 seplen = 1;
5344 }
5345 else {
5346 internal_separator = PyUnicode_FromObject(separator);
5347 if (internal_separator == NULL)
5348 goto onError;
5349 sep = PyUnicode_AS_UNICODE(internal_separator);
5350 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005351 /* In case PyUnicode_FromObject() mutated seq. */
5352 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005353 }
5354 }
5355
5356 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005357 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005358 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005359 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005360 res_p = PyUnicode_AS_UNICODE(res);
5361 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005362
Tim Peters05eba1f2004-08-27 21:32:02 +00005363 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005364 Py_ssize_t itemlen;
5365 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005366
5367 item = PySequence_Fast_GET_ITEM(fseq, i);
5368 /* Convert item to Unicode. */
5369 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5370 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005371 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005372 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005373 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005374 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005375 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005376 item = PyUnicode_FromObject(item);
5377 if (item == NULL)
5378 goto onError;
5379 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005380
Tim Peters91879ab2004-08-27 22:35:44 +00005381 /* In case PyUnicode_FromObject() mutated seq. */
5382 seqlen = PySequence_Fast_GET_SIZE(fseq);
5383
Tim Peters8ce9f162004-08-27 01:49:32 +00005384 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005386 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005387 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005388 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005389 if (i < seqlen - 1) {
5390 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005391 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005392 goto Overflow;
5393 }
5394 if (new_res_used > res_alloc) {
5395 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005396 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005397 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005398 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005399 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005400 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005401 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005402 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005404 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005405 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005407
5408 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005409 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005410 res_p += itemlen;
5411 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005412 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005413 res_p += seplen;
5414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005416 res_used = new_res_used;
5417 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005418
Tim Peters05eba1f2004-08-27 21:32:02 +00005419 /* Shrink res to match the used area; this probably can't fail,
5420 * but it's cheap to check.
5421 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005422 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005423 goto onError;
5424
5425 Done:
5426 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005427 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 return (PyObject *)res;
5429
Tim Peters8ce9f162004-08-27 01:49:32 +00005430 Overflow:
5431 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005432 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005433 Py_DECREF(item);
5434 /* fall through */
5435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005437 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005438 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005439 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return NULL;
5441}
5442
Tim Petersced69f82003-09-16 20:30:58 +00005443static
5444PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005445 Py_ssize_t left,
5446 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 Py_UNICODE fill)
5448{
5449 PyUnicodeObject *u;
5450
5451 if (left < 0)
5452 left = 0;
5453 if (right < 0)
5454 right = 0;
5455
Tim Peters7a29bd52001-09-12 03:03:31 +00005456 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 Py_INCREF(self);
5458 return self;
5459 }
5460
5461 u = _PyUnicode_New(left + self->length + right);
5462 if (u) {
5463 if (left)
5464 Py_UNICODE_FILL(u->str, fill, left);
5465 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5466 if (right)
5467 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5468 }
5469
5470 return u;
5471}
5472
/* Append the slice data[left:right] to `list` as a new unicode object.

   The macro relies on the enclosing function providing a
   `PyObject *str` scratch variable, a `list` object and an `onError`
   label; control jumps to `onError` when creating the slice or
   appending it fails.  The temporary reference held in `str` is
   released in both the success and the failure case.

   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and stays correct inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5483
5484static
5485PyObject *split_whitespace(PyUnicodeObject *self,
5486 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005487 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 register Py_ssize_t i;
5490 register Py_ssize_t j;
5491 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 PyObject *str;
5493
5494 for (i = j = 0; i < len; ) {
5495 /* find a token */
5496 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5497 i++;
5498 j = i;
5499 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5500 i++;
5501 if (j < i) {
5502 if (maxcount-- <= 0)
5503 break;
5504 SPLIT_APPEND(self->str, j, i);
5505 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5506 i++;
5507 j = i;
5508 }
5509 }
5510 if (j < len) {
5511 SPLIT_APPEND(self->str, j, len);
5512 }
5513 return list;
5514
5515 onError:
5516 Py_DECREF(list);
5517 return NULL;
5518}
5519
5520PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005521 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005523 register Py_ssize_t i;
5524 register Py_ssize_t j;
5525 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 PyObject *list;
5527 PyObject *str;
5528 Py_UNICODE *data;
5529
5530 string = PyUnicode_FromObject(string);
5531 if (string == NULL)
5532 return NULL;
5533 data = PyUnicode_AS_UNICODE(string);
5534 len = PyUnicode_GET_SIZE(string);
5535
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 list = PyList_New(0);
5537 if (!list)
5538 goto onError;
5539
5540 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005541 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005542
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005544 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546
5547 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005548 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 if (i < len) {
5550 if (data[i] == '\r' && i + 1 < len &&
5551 data[i+1] == '\n')
5552 i += 2;
5553 else
5554 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005555 if (keepends)
5556 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 }
Guido van Rossum86662912000-04-11 15:38:46 +00005558 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 j = i;
5560 }
5561 if (j < len) {
5562 SPLIT_APPEND(data, j, len);
5563 }
5564
5565 Py_DECREF(string);
5566 return list;
5567
5568 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005569 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 Py_DECREF(string);
5571 return NULL;
5572}
5573
Tim Petersced69f82003-09-16 20:30:58 +00005574static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575PyObject *split_char(PyUnicodeObject *self,
5576 PyObject *list,
5577 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005578 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005580 register Py_ssize_t i;
5581 register Py_ssize_t j;
5582 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 PyObject *str;
5584
5585 for (i = j = 0; i < len; ) {
5586 if (self->str[i] == ch) {
5587 if (maxcount-- <= 0)
5588 break;
5589 SPLIT_APPEND(self->str, j, i);
5590 i = j = i + 1;
5591 } else
5592 i++;
5593 }
5594 if (j <= len) {
5595 SPLIT_APPEND(self->str, j, len);
5596 }
5597 return list;
5598
5599 onError:
5600 Py_DECREF(list);
5601 return NULL;
5602}
5603
Tim Petersced69f82003-09-16 20:30:58 +00005604static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605PyObject *split_substring(PyUnicodeObject *self,
5606 PyObject *list,
5607 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005608 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005610 register Py_ssize_t i;
5611 register Py_ssize_t j;
5612 Py_ssize_t len = self->length;
5613 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 PyObject *str;
5615
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005616 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 if (Py_UNICODE_MATCH(self, i, substring)) {
5618 if (maxcount-- <= 0)
5619 break;
5620 SPLIT_APPEND(self->str, j, i);
5621 i = j = i + sublen;
5622 } else
5623 i++;
5624 }
5625 if (j <= len) {
5626 SPLIT_APPEND(self->str, j, len);
5627 }
5628 return list;
5629
5630 onError:
5631 Py_DECREF(list);
5632 return NULL;
5633}
5634
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005635static
5636PyObject *rsplit_whitespace(PyUnicodeObject *self,
5637 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005638 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005639{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 register Py_ssize_t i;
5641 register Py_ssize_t j;
5642 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005643 PyObject *str;
5644
5645 for (i = j = len - 1; i >= 0; ) {
5646 /* find a token */
5647 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5648 i--;
5649 j = i;
5650 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5651 i--;
5652 if (j > i) {
5653 if (maxcount-- <= 0)
5654 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005655 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005656 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5657 i--;
5658 j = i;
5659 }
5660 }
5661 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005662 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005663 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005664 if (PyList_Reverse(list) < 0)
5665 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005666 return list;
5667
5668 onError:
5669 Py_DECREF(list);
5670 return NULL;
5671}
5672
5673static
5674PyObject *rsplit_char(PyUnicodeObject *self,
5675 PyObject *list,
5676 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005677 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005678{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005679 register Py_ssize_t i;
5680 register Py_ssize_t j;
5681 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005682 PyObject *str;
5683
5684 for (i = j = len - 1; i >= 0; ) {
5685 if (self->str[i] == ch) {
5686 if (maxcount-- <= 0)
5687 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005688 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005689 j = i = i - 1;
5690 } else
5691 i--;
5692 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005693 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005694 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005695 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005696 if (PyList_Reverse(list) < 0)
5697 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005698 return list;
5699
5700 onError:
5701 Py_DECREF(list);
5702 return NULL;
5703}
5704
5705static
5706PyObject *rsplit_substring(PyUnicodeObject *self,
5707 PyObject *list,
5708 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005709 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005710{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005711 register Py_ssize_t i;
5712 register Py_ssize_t j;
5713 Py_ssize_t len = self->length;
5714 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005715 PyObject *str;
5716
5717 for (i = len - sublen, j = len; i >= 0; ) {
5718 if (Py_UNICODE_MATCH(self, i, substring)) {
5719 if (maxcount-- <= 0)
5720 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005721 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005722 j = i;
5723 i -= sublen;
5724 } else
5725 i--;
5726 }
5727 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005728 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005729 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005730 if (PyList_Reverse(list) < 0)
5731 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005732 return list;
5733
5734 onError:
5735 Py_DECREF(list);
5736 return NULL;
5737}
5738
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739#undef SPLIT_APPEND
5740
5741static
5742PyObject *split(PyUnicodeObject *self,
5743 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005744 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745{
5746 PyObject *list;
5747
5748 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005749 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
5751 list = PyList_New(0);
5752 if (!list)
5753 return NULL;
5754
5755 if (substring == NULL)
5756 return split_whitespace(self,list,maxcount);
5757
5758 else if (substring->length == 1)
5759 return split_char(self,list,substring->str[0],maxcount);
5760
5761 else if (substring->length == 0) {
5762 Py_DECREF(list);
5763 PyErr_SetString(PyExc_ValueError, "empty separator");
5764 return NULL;
5765 }
5766 else
5767 return split_substring(self,list,substring,maxcount);
5768}
5769
Tim Petersced69f82003-09-16 20:30:58 +00005770static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771PyObject *rsplit(PyUnicodeObject *self,
5772 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005773 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005774{
5775 PyObject *list;
5776
5777 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005778 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779
5780 list = PyList_New(0);
5781 if (!list)
5782 return NULL;
5783
5784 if (substring == NULL)
5785 return rsplit_whitespace(self,list,maxcount);
5786
5787 else if (substring->length == 1)
5788 return rsplit_char(self,list,substring->str[0],maxcount);
5789
5790 else if (substring->length == 0) {
5791 Py_DECREF(list);
5792 PyErr_SetString(PyExc_ValueError, "empty separator");
5793 return NULL;
5794 }
5795 else
5796 return rsplit_substring(self,list,substring,maxcount);
5797}
5798
5799static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800PyObject *replace(PyUnicodeObject *self,
5801 PyUnicodeObject *str1,
5802 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005803 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804{
5805 PyUnicodeObject *u;
5806
5807 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005808 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809
Thomas Wouters477c8d52006-05-27 19:21:47 +00005810 if (str1->length == str2->length) {
5811 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005812 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005813 if (str1->length == 1) {
5814 /* replace characters */
5815 Py_UNICODE u1, u2;
5816 if (!findchar(self->str, self->length, str1->str[0]))
5817 goto nothing;
5818 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5819 if (!u)
5820 return NULL;
5821 Py_UNICODE_COPY(u->str, self->str, self->length);
5822 u1 = str1->str[0];
5823 u2 = str2->str[0];
5824 for (i = 0; i < u->length; i++)
5825 if (u->str[i] == u1) {
5826 if (--maxcount < 0)
5827 break;
5828 u->str[i] = u2;
5829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005831 i = fastsearch(
5832 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005834 if (i < 0)
5835 goto nothing;
5836 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5837 if (!u)
5838 return NULL;
5839 Py_UNICODE_COPY(u->str, self->str, self->length);
5840 while (i <= self->length - str1->length)
5841 if (Py_UNICODE_MATCH(self, i, str1)) {
5842 if (--maxcount < 0)
5843 break;
5844 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5845 i += str1->length;
5846 } else
5847 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005850
5851 Py_ssize_t n, i, j, e;
5852 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 Py_UNICODE *p;
5854
5855 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005856 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 if (n > maxcount)
5858 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005859 if (n == 0)
5860 goto nothing;
5861 /* new_size = self->length + n * (str2->length - str1->length)); */
5862 delta = (str2->length - str1->length);
5863 if (delta == 0) {
5864 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005866 product = n * (str2->length - str1->length);
5867 if ((product / (str2->length - str1->length)) != n) {
5868 PyErr_SetString(PyExc_OverflowError,
5869 "replace string is too long");
5870 return NULL;
5871 }
5872 new_size = self->length + product;
5873 if (new_size < 0) {
5874 PyErr_SetString(PyExc_OverflowError,
5875 "replace string is too long");
5876 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 }
5878 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005879 u = _PyUnicode_New(new_size);
5880 if (!u)
5881 return NULL;
5882 i = 0;
5883 p = u->str;
5884 e = self->length - str1->length;
5885 if (str1->length > 0) {
5886 while (n-- > 0) {
5887 /* look for next match */
5888 j = i;
5889 while (j <= e) {
5890 if (Py_UNICODE_MATCH(self, j, str1))
5891 break;
5892 j++;
5893 }
5894 if (j > i) {
5895 if (j > e)
5896 break;
5897 /* copy unchanged part [i:j] */
5898 Py_UNICODE_COPY(p, self->str+i, j-i);
5899 p += j - i;
5900 }
5901 /* copy substitution string */
5902 if (str2->length > 0) {
5903 Py_UNICODE_COPY(p, str2->str, str2->length);
5904 p += str2->length;
5905 }
5906 i = j + str1->length;
5907 }
5908 if (i < self->length)
5909 /* copy tail [i:] */
5910 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5911 } else {
5912 /* interleave */
5913 while (n > 0) {
5914 Py_UNICODE_COPY(p, str2->str, str2->length);
5915 p += str2->length;
5916 if (--n <= 0)
5917 break;
5918 *p++ = self->str[i++];
5919 }
5920 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005924
5925nothing:
5926 /* nothing to replace; return original string (when possible) */
5927 if (PyUnicode_CheckExact(self)) {
5928 Py_INCREF(self);
5929 return (PyObject *) self;
5930 }
5931 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932}
5933
5934/* --- Unicode Object Methods --------------------------------------------- */
5935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005936PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937"S.title() -> unicode\n\
5938\n\
5939Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005940characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941
5942static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005943unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 return fixup(self, fixtitle);
5946}
5947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005948PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949"S.capitalize() -> unicode\n\
5950\n\
5951Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005952have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953
5954static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005955unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 return fixup(self, fixcapitalize);
5958}
5959
#if 0
/* Disabled: this block is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word via fixup() and
   join the words again with PyUnicode_Join(NULL, ...).  Returns NULL
   if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5997
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005998/* Argument converter. Coerces to a single unicode character */
5999
6000static int
6001convert_uc(PyObject *obj, void *addr)
6002{
6003 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6004 PyObject *uniobj;
6005 Py_UNICODE *unistr;
6006
6007 uniobj = PyUnicode_FromObject(obj);
6008 if (uniobj == NULL) {
6009 PyErr_SetString(PyExc_TypeError,
6010 "The fill character cannot be converted to Unicode");
6011 return 0;
6012 }
6013 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6014 PyErr_SetString(PyExc_TypeError,
6015 "The fill character must be exactly one character long");
6016 Py_DECREF(uniobj);
6017 return 0;
6018 }
6019 unistr = PyUnicode_AS_UNICODE(uniobj);
6020 *fillcharloc = unistr[0];
6021 Py_DECREF(uniobj);
6022 return 1;
6023}
6024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006025PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006026"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006028Return S centered in a Unicode string of length width. Padding is\n\
6029done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
6031static PyObject *
6032unicode_center(PyUnicodeObject *self, PyObject *args)
6033{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006034 Py_ssize_t marg, left;
6035 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006036 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Thomas Woutersde017742006-02-16 19:34:37 +00006038 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 return NULL;
6040
Tim Peters7a29bd52001-09-12 03:03:31 +00006041 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 Py_INCREF(self);
6043 return (PyObject*) self;
6044 }
6045
6046 marg = width - self->length;
6047 left = marg / 2 + (marg & width & 1);
6048
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006049 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050}
6051
Marc-André Lemburge5034372000-08-08 08:04:29 +00006052#if 0
6053
6054/* This code should go into some future Unicode collation support
6055 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006056 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006057
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006058/* speedy UTF-16 code point order comparison */
6059/* gleaned from: */
6060/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6061
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006062static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006063{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006064 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006065 0, 0, 0, 0, 0, 0, 0, 0,
6066 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006067 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006068};
6069
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070static int
6071unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6072{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006073 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006074
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 Py_UNICODE *s1 = str1->str;
6076 Py_UNICODE *s2 = str2->str;
6077
6078 len1 = str1->length;
6079 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006080
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006082 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006083
6084 c1 = *s1++;
6085 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006086
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006087 if (c1 > (1<<11) * 26)
6088 c1 += utf16Fixup[c1>>11];
6089 if (c2 > (1<<11) * 26)
6090 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006091 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006092
6093 if (c1 != c2)
6094 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006095
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006096 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 }
6098
6099 return (len1 < len2) ? -1 : (len1 != len2);
6100}
6101
Marc-André Lemburge5034372000-08-08 08:04:29 +00006102#else
6103
6104static int
6105unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6106{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006107 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006108
6109 Py_UNICODE *s1 = str1->str;
6110 Py_UNICODE *s2 = str2->str;
6111
6112 len1 = str1->length;
6113 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006114
Marc-André Lemburge5034372000-08-08 08:04:29 +00006115 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006116 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006117
Fredrik Lundh45714e92001-06-26 16:39:36 +00006118 c1 = *s1++;
6119 c2 = *s2++;
6120
6121 if (c1 != c2)
6122 return (c1 < c2) ? -1 : 1;
6123
Marc-André Lemburge5034372000-08-08 08:04:29 +00006124 len1--; len2--;
6125 }
6126
6127 return (len1 < len2) ? -1 : (len1 != len2);
6128}
6129
6130#endif
6131
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132int PyUnicode_Compare(PyObject *left,
6133 PyObject *right)
6134{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006135 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6136 return unicode_compare((PyUnicodeObject *)left,
6137 (PyUnicodeObject *)right);
6138 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6139 (PyUnicode_Check(left) && PyString_Check(right))) {
6140 if (PyUnicode_Check(left))
6141 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6142 if (PyUnicode_Check(right))
6143 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6144 assert(PyString_Check(left));
6145 assert(PyString_Check(right));
6146 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006148 PyErr_Format(PyExc_TypeError,
6149 "Can't compare %.100s and %.100s",
6150 left->ob_type->tp_name,
6151 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 return -1;
6153}
6154
Martin v. Löwis5b222132007-06-10 09:51:05 +00006155int
6156PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6157{
6158 int i;
6159 Py_UNICODE *id;
6160 assert(PyUnicode_Check(uni));
6161 id = PyUnicode_AS_UNICODE(uni);
6162 /* Compare Unicode string and source character set string */
6163 for (i = 0; id[i] && str[i]; i++)
6164 if (id[i] != str[i])
6165 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6166 if (id[i])
6167 return 1; /* uni is longer */
6168 if (str[i])
6169 return -1; /* str is longer */
6170 return 0;
6171}
6172
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006173PyObject *PyUnicode_RichCompare(PyObject *left,
6174 PyObject *right,
6175 int op)
6176{
6177 int result;
6178
6179 result = PyUnicode_Compare(left, right);
6180 if (result == -1 && PyErr_Occurred())
6181 goto onError;
6182
6183 /* Convert the return value to a Boolean */
6184 switch (op) {
6185 case Py_EQ:
6186 result = (result == 0);
6187 break;
6188 case Py_NE:
6189 result = (result != 0);
6190 break;
6191 case Py_LE:
6192 result = (result <= 0);
6193 break;
6194 case Py_GE:
6195 result = (result >= 0);
6196 break;
6197 case Py_LT:
6198 result = (result == -1);
6199 break;
6200 case Py_GT:
6201 result = (result == 1);
6202 break;
6203 }
6204 return PyBool_FromLong(result);
6205
6206 onError:
6207
6208 /* Standard case
6209
6210 Type errors mean that PyUnicode_FromObject() could not convert
6211 one of the arguments (usually the right hand side) to Unicode,
6212 ie. we can't handle the comparison request. However, it is
6213 possible that the other object knows a comparison method, which
6214 is why we return Py_NotImplemented to give the other object a
6215 chance.
6216
6217 */
6218 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6219 PyErr_Clear();
6220 Py_INCREF(Py_NotImplemented);
6221 return Py_NotImplemented;
6222 }
6223 if (op != Py_EQ && op != Py_NE)
6224 return NULL;
6225
6226 /* Equality comparison.
6227
6228 This is a special case: we silence any PyExc_UnicodeDecodeError
6229 and instead turn it into a PyErr_UnicodeWarning.
6230
6231 */
6232 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6233 return NULL;
6234 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006235 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6236 (op == Py_EQ) ?
6237 "Unicode equal comparison "
6238 "failed to convert both arguments to Unicode - "
6239 "interpreting them as being unequal"
6240 :
6241 "Unicode unequal comparison "
6242 "failed to convert both arguments to Unicode - "
6243 "interpreting them as being unequal",
6244 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006245 return NULL;
6246 result = (op == Py_NE);
6247 return PyBool_FromLong(result);
6248}
6249
Guido van Rossum403d68b2000-03-13 15:55:09 +00006250int PyUnicode_Contains(PyObject *container,
6251 PyObject *element)
6252{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006253 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006254 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006255
6256 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006257 sub = PyUnicode_FromObject(element);
6258 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006259 PyErr_Format(PyExc_TypeError,
6260 "'in <string>' requires string as left operand, not %s",
6261 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006262 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006263 }
6264
Thomas Wouters477c8d52006-05-27 19:21:47 +00006265 str = PyUnicode_FromObject(container);
6266 if (!str) {
6267 Py_DECREF(sub);
6268 return -1;
6269 }
6270
6271 result = stringlib_contains_obj(str, sub);
6272
6273 Py_DECREF(str);
6274 Py_DECREF(sub);
6275
Guido van Rossum403d68b2000-03-13 15:55:09 +00006276 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006277}
6278
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279/* Concat to string or Unicode object giving a new Unicode object. */
6280
6281PyObject *PyUnicode_Concat(PyObject *left,
6282 PyObject *right)
6283{
6284 PyUnicodeObject *u = NULL, *v = NULL, *w;
6285
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006286 if (PyBytes_Check(left) || PyBytes_Check(right))
6287 return PyBytes_Concat(left, right);
6288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 /* Coerce the two arguments */
6290 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6291 if (u == NULL)
6292 goto onError;
6293 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6294 if (v == NULL)
6295 goto onError;
6296
6297 /* Shortcuts */
6298 if (v == unicode_empty) {
6299 Py_DECREF(v);
6300 return (PyObject *)u;
6301 }
6302 if (u == unicode_empty) {
6303 Py_DECREF(u);
6304 return (PyObject *)v;
6305 }
6306
6307 /* Concat the two Unicode strings */
6308 w = _PyUnicode_New(u->length + v->length);
6309 if (w == NULL)
6310 goto onError;
6311 Py_UNICODE_COPY(w->str, u->str, u->length);
6312 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6313
6314 Py_DECREF(u);
6315 Py_DECREF(v);
6316 return (PyObject *)w;
6317
6318onError:
6319 Py_XDECREF(u);
6320 Py_XDECREF(v);
6321 return NULL;
6322}
6323
Walter Dörwald1ab83302007-05-18 17:15:44 +00006324void
6325PyUnicode_Append(PyObject **pleft, PyObject *right)
6326{
6327 PyObject *new;
6328 if (*pleft == NULL)
6329 return;
6330 if (right == NULL || !PyUnicode_Check(*pleft)) {
6331 Py_DECREF(*pleft);
6332 *pleft = NULL;
6333 return;
6334 }
6335 new = PyUnicode_Concat(*pleft, right);
6336 Py_DECREF(*pleft);
6337 *pleft = new;
6338}
6339
6340void
6341PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6342{
6343 PyUnicode_Append(pleft, right);
6344 Py_XDECREF(right);
6345}
6346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006347PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348"S.count(sub[, start[, end]]) -> int\n\
6349\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006350Return the number of non-overlapping occurrences of substring sub in\n\
6351Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006352interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
6354static PyObject *
6355unicode_count(PyUnicodeObject *self, PyObject *args)
6356{
6357 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006358 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006359 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 PyObject *result;
6361
Guido van Rossumb8872e62000-05-09 14:14:27 +00006362 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6363 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 return NULL;
6365
6366 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006367 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 if (substring == NULL)
6369 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006370
Thomas Wouters477c8d52006-05-27 19:21:47 +00006371 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
Thomas Wouters477c8d52006-05-27 19:21:47 +00006373 result = PyInt_FromSsize_t(
6374 stringlib_count(self->str + start, end - start,
6375 substring->str, substring->length)
6376 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377
6378 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006379
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 return result;
6381}
6382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006383PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006384"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006386Encodes S using the codec registered for encoding. encoding defaults\n\
6387to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006388handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6390'xmlcharrefreplace' as well as any other name registered with\n\
6391codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392
6393static PyObject *
6394unicode_encode(PyUnicodeObject *self, PyObject *args)
6395{
6396 char *encoding = NULL;
6397 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006398 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006399
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6401 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006402 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006403 if (v == NULL)
6404 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006405 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006406 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006407 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006408 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006409 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006410 Py_DECREF(v);
6411 return NULL;
6412 }
6413 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006414
6415 onError:
6416 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006417}
6418
6419PyDoc_STRVAR(decode__doc__,
6420"S.decode([encoding[,errors]]) -> string or unicode\n\
6421\n\
6422Decodes S using the codec registered for encoding. encoding defaults\n\
6423to the default encoding. errors may be given to set a different error\n\
6424handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6425a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6426as well as any other name registerd with codecs.register_error that is\n\
6427able to handle UnicodeDecodeErrors.");
6428
6429static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006430unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006431{
Guido van Rossuma74184e2007-08-29 04:05:57 +00006432 PyErr_Format(PyExc_TypeError, "decoding str is not supported");
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006433 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434}
6435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006436PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437"S.expandtabs([tabsize]) -> unicode\n\
6438\n\
6439Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006440If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441
6442static PyObject*
6443unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6444{
6445 Py_UNICODE *e;
6446 Py_UNICODE *p;
6447 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006448 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 PyUnicodeObject *u;
6450 int tabsize = 8;
6451
6452 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6453 return NULL;
6454
Thomas Wouters7e474022000-07-16 12:04:32 +00006455 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006456 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 e = self->str + self->length;
6458 for (p = self->str; p < e; p++)
6459 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006460 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006462 if (old_j > j) {
6463 PyErr_SetString(PyExc_OverflowError,
6464 "new string is too long");
6465 return NULL;
6466 }
6467 old_j = j;
6468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 }
6470 else {
6471 j++;
6472 if (*p == '\n' || *p == '\r') {
6473 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006474 old_j = j = 0;
6475 if (i < 0) {
6476 PyErr_SetString(PyExc_OverflowError,
6477 "new string is too long");
6478 return NULL;
6479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 }
6481 }
6482
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006483 if ((i + j) < 0) {
6484 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6485 return NULL;
6486 }
6487
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 /* Second pass: create output string and fill it */
6489 u = _PyUnicode_New(i + j);
6490 if (!u)
6491 return NULL;
6492
6493 j = 0;
6494 q = u->str;
6495
6496 for (p = self->str; p < e; p++)
6497 if (*p == '\t') {
6498 if (tabsize > 0) {
6499 i = tabsize - (j % tabsize);
6500 j += i;
6501 while (i--)
6502 *q++ = ' ';
6503 }
6504 }
6505 else {
6506 j++;
6507 *q++ = *p;
6508 if (*p == '\n' || *p == '\r')
6509 j = 0;
6510 }
6511
6512 return (PyObject*) u;
6513}
6514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006515PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516"S.find(sub [,start [,end]]) -> int\n\
6517\n\
6518Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006519such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520arguments start and end are interpreted as in slice notation.\n\
6521\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006522Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524static PyObject *
6525unicode_find(PyUnicodeObject *self, PyObject *args)
6526{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006527 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006529 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531
Guido van Rossumb8872e62000-05-09 14:14:27 +00006532 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6533 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006535 substring = PyUnicode_FromObject(substring);
6536 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 return NULL;
6538
Thomas Wouters477c8d52006-05-27 19:21:47 +00006539 result = stringlib_find_slice(
6540 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6541 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6542 start, end
6543 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
6545 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006546
6547 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548}
6549
6550static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006551unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552{
6553 if (index < 0 || index >= self->length) {
6554 PyErr_SetString(PyExc_IndexError, "string index out of range");
6555 return NULL;
6556 }
6557
6558 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6559}
6560
6561static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006562unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006564 /* Since Unicode objects compare equal to their UTF-8 string
6565 counterparts, we hash the UTF-8 string. */
6566 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6567 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568}
6569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006570PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571"S.index(sub [,start [,end]]) -> int\n\
6572\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006573Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
6575static PyObject *
6576unicode_index(PyUnicodeObject *self, PyObject *args)
6577{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006578 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006579 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006580 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006581 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
Guido van Rossumb8872e62000-05-09 14:14:27 +00006583 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6584 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006586 substring = PyUnicode_FromObject(substring);
6587 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 return NULL;
6589
Thomas Wouters477c8d52006-05-27 19:21:47 +00006590 result = stringlib_find_slice(
6591 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6592 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6593 start, end
6594 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
6596 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 if (result < 0) {
6599 PyErr_SetString(PyExc_ValueError, "substring not found");
6600 return NULL;
6601 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006602
Martin v. Löwis18e16552006-02-15 17:27:45 +00006603 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006606PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006607"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006609Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006610at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
6612static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006613unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614{
6615 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6616 register const Py_UNICODE *e;
6617 int cased;
6618
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 /* Shortcut for single character strings */
6620 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006621 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006623 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006624 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006625 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006626
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 e = p + PyUnicode_GET_SIZE(self);
6628 cased = 0;
6629 for (; p < e; p++) {
6630 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006631
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006633 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 else if (!cased && Py_UNICODE_ISLOWER(ch))
6635 cased = 1;
6636 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006637 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638}
6639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006641"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006643Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006644at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645
6646static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006647unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
6649 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6650 register const Py_UNICODE *e;
6651 int cased;
6652
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 /* Shortcut for single character strings */
6654 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006655 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006657 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006658 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006659 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006660
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 e = p + PyUnicode_GET_SIZE(self);
6662 cased = 0;
6663 for (; p < e; p++) {
6664 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006665
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006667 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 else if (!cased && Py_UNICODE_ISUPPER(ch))
6669 cased = 1;
6670 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006671 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672}
6673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006674PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006675"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006677Return True if S is a titlecased string and there is at least one\n\
6678character in S, i.e. upper- and titlecase characters may only\n\
6679follow uncased characters and lowercase characters only cased ones.\n\
6680Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
6682static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006683unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684{
6685 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6686 register const Py_UNICODE *e;
6687 int cased, previous_is_cased;
6688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 /* Shortcut for single character strings */
6690 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006691 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6692 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006694 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006695 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006696 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 e = p + PyUnicode_GET_SIZE(self);
6699 cased = 0;
6700 previous_is_cased = 0;
6701 for (; p < e; p++) {
6702 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6705 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006706 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 previous_is_cased = 1;
6708 cased = 1;
6709 }
6710 else if (Py_UNICODE_ISLOWER(ch)) {
6711 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 previous_is_cased = 1;
6714 cased = 1;
6715 }
6716 else
6717 previous_is_cased = 0;
6718 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006719 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720}
6721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006722PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006723"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006725Return True if all characters in S are whitespace\n\
6726and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
6728static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006729unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730{
6731 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6732 register const Py_UNICODE *e;
6733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 /* Shortcut for single character strings */
6735 if (PyUnicode_GET_SIZE(self) == 1 &&
6736 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006737 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006739 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006740 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006741 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 e = p + PyUnicode_GET_SIZE(self);
6744 for (; p < e; p++) {
6745 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006746 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006748 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749}
6750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006751PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006752"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006753\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006754Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006755and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006756
6757static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006758unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006759{
6760 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6761 register const Py_UNICODE *e;
6762
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006763 /* Shortcut for single character strings */
6764 if (PyUnicode_GET_SIZE(self) == 1 &&
6765 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006766 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006767
6768 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006769 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006770 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006771
6772 e = p + PyUnicode_GET_SIZE(self);
6773 for (; p < e; p++) {
6774 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006775 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778}
6779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006780PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006781"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006783Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006785
6786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006787unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006788{
6789 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6790 register const Py_UNICODE *e;
6791
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006792 /* Shortcut for single character strings */
6793 if (PyUnicode_GET_SIZE(self) == 1 &&
6794 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796
6797 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006798 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006799 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006800
6801 e = p + PyUnicode_GET_SIZE(self);
6802 for (; p < e; p++) {
6803 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006804 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006805 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006807}
6808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006809PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006810"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006813False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
6815static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006816unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817{
6818 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6819 register const Py_UNICODE *e;
6820
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 /* Shortcut for single character strings */
6822 if (PyUnicode_GET_SIZE(self) == 1 &&
6823 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006824 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006826 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006827 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006828 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006829
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 e = p + PyUnicode_GET_SIZE(self);
6831 for (; p < e; p++) {
6832 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006833 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006835 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836}
6837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006838PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006839"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006841Return True if all characters in S are digits\n\
6842and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
6844static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006845unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846{
6847 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6848 register const Py_UNICODE *e;
6849
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 /* Shortcut for single character strings */
6851 if (PyUnicode_GET_SIZE(self) == 1 &&
6852 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006853 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006855 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006856 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006857 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006858
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 e = p + PyUnicode_GET_SIZE(self);
6860 for (; p < e; p++) {
6861 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865}
6866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006867PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006868"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006871False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872
6873static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006874unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875{
6876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6877 register const Py_UNICODE *e;
6878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 /* Shortcut for single character strings */
6880 if (PyUnicode_GET_SIZE(self) == 1 &&
6881 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006884 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006885 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006886 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006887
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 e = p + PyUnicode_GET_SIZE(self);
6889 for (; p < e; p++) {
6890 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Martin v. Löwis47383402007-08-15 07:32:56 +00006896int
6897PyUnicode_IsIdentifier(PyObject *self)
6898{
6899 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6900 register const Py_UNICODE *e;
6901
6902 /* Special case for empty strings */
6903 if (PyUnicode_GET_SIZE(self) == 0)
6904 return 0;
6905
6906 /* PEP 3131 says that the first character must be in
6907 XID_Start and subsequent characters in XID_Continue,
6908 and for the ASCII range, the 2.x rules apply (i.e
6909 start with letters and underscore, continue with
6910 letters, digits, underscore). However, given the current
6911 definition of XID_Start and XID_Continue, it is sufficient
6912 to check just for these, except that _ must be allowed
6913 as starting an identifier. */
6914 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6915 return 0;
6916
6917 e = p + PyUnicode_GET_SIZE(self);
6918 for (p++; p < e; p++) {
6919 if (!_PyUnicode_IsXidContinue(*p))
6920 return 0;
6921 }
6922 return 1;
6923}
6924
6925PyDoc_STRVAR(isidentifier__doc__,
6926"S.isidentifier() -> bool\n\
6927\n\
6928Return True if S is a valid identifier according\n\
6929to the language definition.");
6930
6931static PyObject*
6932unicode_isidentifier(PyObject *self)
6933{
6934 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6935}
6936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006937PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938"S.join(sequence) -> unicode\n\
6939\n\
6940Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006941sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942
6943static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006944unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006946 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947}
6948
Martin v. Löwis18e16552006-02-15 17:27:45 +00006949static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950unicode_length(PyUnicodeObject *self)
6951{
6952 return self->length;
6953}
6954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006956"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957\n\
6958Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006959done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
6961static PyObject *
6962unicode_ljust(PyUnicodeObject *self, PyObject *args)
6963{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006964 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006965 Py_UNICODE fillchar = ' ';
6966
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006967 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 return NULL;
6969
Tim Peters7a29bd52001-09-12 03:03:31 +00006970 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 Py_INCREF(self);
6972 return (PyObject*) self;
6973 }
6974
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006975 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976}
6977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006978PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979"S.lower() -> unicode\n\
6980\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006981Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
6983static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006984unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 return fixup(self, fixlower);
6987}
6988
/* Strip modes; the values double as indexes into stripformat[] below. */
enum {
    LEFTSTRIP = 0,
    RIGHTSTRIP = 1,
    BOTHSTRIP = 2
};

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6997
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006998/* externally visible for str.strip(unicode) */
6999PyObject *
7000_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7001{
7002 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007003 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007004 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007005 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7006 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007007
Thomas Wouters477c8d52006-05-27 19:21:47 +00007008 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7009
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007010 i = 0;
7011 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007012 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7013 i++;
7014 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007015 }
7016
7017 j = len;
7018 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007019 do {
7020 j--;
7021 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7022 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007023 }
7024
7025 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007026 Py_INCREF(self);
7027 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007028 }
7029 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007030 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007031}
7032
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033
7034static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007035do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007037 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007038 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007039
7040 i = 0;
7041 if (striptype != RIGHTSTRIP) {
7042 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7043 i++;
7044 }
7045 }
7046
7047 j = len;
7048 if (striptype != LEFTSTRIP) {
7049 do {
7050 j--;
7051 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7052 j++;
7053 }
7054
7055 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7056 Py_INCREF(self);
7057 return (PyObject*)self;
7058 }
7059 else
7060 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061}
7062
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007063
7064static PyObject *
7065do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7066{
7067 PyObject *sep = NULL;
7068
7069 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7070 return NULL;
7071
7072 if (sep != NULL && sep != Py_None) {
7073 if (PyUnicode_Check(sep))
7074 return _PyUnicode_XStrip(self, striptype, sep);
7075 else if (PyString_Check(sep)) {
7076 PyObject *res;
7077 sep = PyUnicode_FromObject(sep);
7078 if (sep==NULL)
7079 return NULL;
7080 res = _PyUnicode_XStrip(self, striptype, sep);
7081 Py_DECREF(sep);
7082 return res;
7083 }
7084 else {
7085 PyErr_Format(PyExc_TypeError,
7086 "%s arg must be None, unicode or str",
7087 STRIPNAME(striptype));
7088 return NULL;
7089 }
7090 }
7091
7092 return do_strip(self, striptype);
7093}
7094
7095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007096PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007097"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007098\n\
7099Return a copy of the string S with leading and trailing\n\
7100whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007101If chars is given and not None, remove characters in chars instead.\n\
7102If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007103
7104static PyObject *
7105unicode_strip(PyUnicodeObject *self, PyObject *args)
7106{
7107 if (PyTuple_GET_SIZE(args) == 0)
7108 return do_strip(self, BOTHSTRIP); /* Common case */
7109 else
7110 return do_argstrip(self, BOTHSTRIP, args);
7111}
7112
7113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007114PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007115"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116\n\
7117Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007118If chars is given and not None, remove characters in chars instead.\n\
7119If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120
7121static PyObject *
7122unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7123{
7124 if (PyTuple_GET_SIZE(args) == 0)
7125 return do_strip(self, LEFTSTRIP); /* Common case */
7126 else
7127 return do_argstrip(self, LEFTSTRIP, args);
7128}
7129
7130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007131PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007132"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007133\n\
7134Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007135If chars is given and not None, remove characters in chars instead.\n\
7136If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007137
7138static PyObject *
7139unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7140{
7141 if (PyTuple_GET_SIZE(args) == 0)
7142 return do_strip(self, RIGHTSTRIP); /* Common case */
7143 else
7144 return do_argstrip(self, RIGHTSTRIP, args);
7145}
7146
7147
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007149unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150{
7151 PyUnicodeObject *u;
7152 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007153 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007154 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155
7156 if (len < 0)
7157 len = 0;
7158
Tim Peters7a29bd52001-09-12 03:03:31 +00007159 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 /* no repeat, return original string */
7161 Py_INCREF(str);
7162 return (PyObject*) str;
7163 }
Tim Peters8f422462000-09-09 06:13:41 +00007164
7165 /* ensure # of chars needed doesn't overflow int and # of bytes
7166 * needed doesn't overflow size_t
7167 */
7168 nchars = len * str->length;
7169 if (len && nchars / len != str->length) {
7170 PyErr_SetString(PyExc_OverflowError,
7171 "repeated string is too long");
7172 return NULL;
7173 }
7174 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7175 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7176 PyErr_SetString(PyExc_OverflowError,
7177 "repeated string is too long");
7178 return NULL;
7179 }
7180 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 if (!u)
7182 return NULL;
7183
7184 p = u->str;
7185
Thomas Wouters477c8d52006-05-27 19:21:47 +00007186 if (str->length == 1 && len > 0) {
7187 Py_UNICODE_FILL(p, str->str[0], len);
7188 } else {
7189 Py_ssize_t done = 0; /* number of characters copied this far */
7190 if (done < nchars) {
7191 Py_UNICODE_COPY(p, str->str, str->length);
7192 done = str->length;
7193 }
7194 while (done < nchars) {
7195 int n = (done <= nchars-done) ? done : nchars-done;
7196 Py_UNICODE_COPY(p+done, p, n);
7197 done += n;
7198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 }
7200
7201 return (PyObject*) u;
7202}
7203
7204PyObject *PyUnicode_Replace(PyObject *obj,
7205 PyObject *subobj,
7206 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007207 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208{
7209 PyObject *self;
7210 PyObject *str1;
7211 PyObject *str2;
7212 PyObject *result;
7213
7214 self = PyUnicode_FromObject(obj);
7215 if (self == NULL)
7216 return NULL;
7217 str1 = PyUnicode_FromObject(subobj);
7218 if (str1 == NULL) {
7219 Py_DECREF(self);
7220 return NULL;
7221 }
7222 str2 = PyUnicode_FromObject(replobj);
7223 if (str2 == NULL) {
7224 Py_DECREF(self);
7225 Py_DECREF(str1);
7226 return NULL;
7227 }
Tim Petersced69f82003-09-16 20:30:58 +00007228 result = replace((PyUnicodeObject *)self,
7229 (PyUnicodeObject *)str1,
7230 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 maxcount);
7232 Py_DECREF(self);
7233 Py_DECREF(str1);
7234 Py_DECREF(str2);
7235 return result;
7236}
7237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007238PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239"S.replace (old, new[, maxsplit]) -> unicode\n\
7240\n\
7241Return a copy of S with all occurrences of substring\n\
7242old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007243given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244
7245static PyObject*
7246unicode_replace(PyUnicodeObject *self, PyObject *args)
7247{
7248 PyUnicodeObject *str1;
7249 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007250 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 PyObject *result;
7252
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 return NULL;
7255 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7256 if (str1 == NULL)
7257 return NULL;
7258 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007259 if (str2 == NULL) {
7260 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
7264 result = replace(self, str1, str2, maxcount);
7265
7266 Py_DECREF(str1);
7267 Py_DECREF(str2);
7268 return result;
7269}
7270
7271static
7272PyObject *unicode_repr(PyObject *unicode)
7273{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007274 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007275 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007276 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7277 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7278
7279 /* XXX(nnorwitz): rather than over-allocating, it would be
7280 better to choose a different scheme. Perhaps scan the
7281 first N-chars of the string and allocate based on that size.
7282 */
7283 /* Initial allocation is based on the longest-possible unichr
7284 escape.
7285
7286 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7287 unichr, so in this case it's the longest unichr escape. In
7288 narrow (UTF-16) builds this is five chars per source unichr
7289 since there are two unichrs in the surrogate pair, so in narrow
7290 (UTF-16) builds it's not the longest unichr escape.
7291
7292 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7293 so in the narrow (UTF-16) build case it's the longest unichr
7294 escape.
7295 */
7296
Walter Dörwald1ab83302007-05-18 17:15:44 +00007297 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007298 2 /* quotes */
7299#ifdef Py_UNICODE_WIDE
7300 + 10*size
7301#else
7302 + 6*size
7303#endif
7304 + 1);
7305 if (repr == NULL)
7306 return NULL;
7307
Walter Dörwald1ab83302007-05-18 17:15:44 +00007308 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007309
7310 /* Add quote */
7311 *p++ = (findchar(s, size, '\'') &&
7312 !findchar(s, size, '"')) ? '"' : '\'';
7313 while (size-- > 0) {
7314 Py_UNICODE ch = *s++;
7315
7316 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007317 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007318 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007319 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007320 continue;
7321 }
7322
7323#ifdef Py_UNICODE_WIDE
7324 /* Map 21-bit characters to '\U00xxxxxx' */
7325 else if (ch >= 0x10000) {
7326 *p++ = '\\';
7327 *p++ = 'U';
7328 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7329 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7330 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7331 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7332 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7333 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7334 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7335 *p++ = hexdigits[ch & 0x0000000F];
7336 continue;
7337 }
7338#else
7339 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7340 else if (ch >= 0xD800 && ch < 0xDC00) {
7341 Py_UNICODE ch2;
7342 Py_UCS4 ucs;
7343
7344 ch2 = *s++;
7345 size--;
7346 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7347 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7348 *p++ = '\\';
7349 *p++ = 'U';
7350 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7351 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7352 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7353 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7354 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7355 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7356 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7357 *p++ = hexdigits[ucs & 0x0000000F];
7358 continue;
7359 }
7360 /* Fall through: isolated surrogates are copied as-is */
7361 s--;
7362 size++;
7363 }
7364#endif
7365
7366 /* Map 16-bit characters to '\uxxxx' */
7367 if (ch >= 256) {
7368 *p++ = '\\';
7369 *p++ = 'u';
7370 *p++ = hexdigits[(ch >> 12) & 0x000F];
7371 *p++ = hexdigits[(ch >> 8) & 0x000F];
7372 *p++ = hexdigits[(ch >> 4) & 0x000F];
7373 *p++ = hexdigits[ch & 0x000F];
7374 }
7375
7376 /* Map special whitespace to '\t', \n', '\r' */
7377 else if (ch == '\t') {
7378 *p++ = '\\';
7379 *p++ = 't';
7380 }
7381 else if (ch == '\n') {
7382 *p++ = '\\';
7383 *p++ = 'n';
7384 }
7385 else if (ch == '\r') {
7386 *p++ = '\\';
7387 *p++ = 'r';
7388 }
7389
7390 /* Map non-printable US ASCII to '\xhh' */
7391 else if (ch < ' ' || ch >= 0x7F) {
7392 *p++ = '\\';
7393 *p++ = 'x';
7394 *p++ = hexdigits[(ch >> 4) & 0x000F];
7395 *p++ = hexdigits[ch & 0x000F];
7396 }
7397
7398 /* Copy everything else as-is */
7399 else
7400 *p++ = (char) ch;
7401 }
7402 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007403 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007404
7405 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007406 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007407 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408}
7409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007410PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411"S.rfind(sub [,start [,end]]) -> int\n\
7412\n\
7413Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007414such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415arguments start and end are interpreted as in slice notation.\n\
7416\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007417Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
7419static PyObject *
7420unicode_rfind(PyUnicodeObject *self, PyObject *args)
7421{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007422 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007423 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007424 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007425 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
Guido van Rossumb8872e62000-05-09 14:14:27 +00007427 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7428 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007430 substring = PyUnicode_FromObject(substring);
7431 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 return NULL;
7433
Thomas Wouters477c8d52006-05-27 19:21:47 +00007434 result = stringlib_rfind_slice(
7435 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7436 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7437 start, end
7438 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439
7440 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007441
7442 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443}
7444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007445PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446"S.rindex(sub [,start [,end]]) -> int\n\
7447\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007448Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
7450static PyObject *
7451unicode_rindex(PyUnicodeObject *self, PyObject *args)
7452{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007453 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007454 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007455 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007456 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Guido van Rossumb8872e62000-05-09 14:14:27 +00007458 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7459 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007461 substring = PyUnicode_FromObject(substring);
7462 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 return NULL;
7464
Thomas Wouters477c8d52006-05-27 19:21:47 +00007465 result = stringlib_rfind_slice(
7466 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7467 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7468 start, end
7469 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470
7471 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007472
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473 if (result < 0) {
7474 PyErr_SetString(PyExc_ValueError, "substring not found");
7475 return NULL;
7476 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007477 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478}
7479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007480PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007481"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482\n\
7483Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007484done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
7486static PyObject *
7487unicode_rjust(PyUnicodeObject *self, PyObject *args)
7488{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007489 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007490 Py_UNICODE fillchar = ' ';
7491
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007492 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 return NULL;
7494
Tim Peters7a29bd52001-09-12 03:03:31 +00007495 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 Py_INCREF(self);
7497 return (PyObject*) self;
7498 }
7499
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007500 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501}
7502
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007504unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505{
7506 /* standard clamping */
7507 if (start < 0)
7508 start = 0;
7509 if (end < 0)
7510 end = 0;
7511 if (end > self->length)
7512 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007513 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 /* full slice, return original string */
7515 Py_INCREF(self);
7516 return (PyObject*) self;
7517 }
7518 if (start > end)
7519 start = end;
7520 /* copy slice */
7521 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7522 end - start);
7523}
7524
7525PyObject *PyUnicode_Split(PyObject *s,
7526 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007527 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528{
7529 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007530
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 s = PyUnicode_FromObject(s);
7532 if (s == NULL)
7533 return NULL;
7534 if (sep != NULL) {
7535 sep = PyUnicode_FromObject(sep);
7536 if (sep == NULL) {
7537 Py_DECREF(s);
7538 return NULL;
7539 }
7540 }
7541
7542 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7543
7544 Py_DECREF(s);
7545 Py_XDECREF(sep);
7546 return result;
7547}
7548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007549PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550"S.split([sep [,maxsplit]]) -> list of strings\n\
7551\n\
7552Return a list of the words in S, using sep as the\n\
7553delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007554splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007555any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
7557static PyObject*
7558unicode_split(PyUnicodeObject *self, PyObject *args)
7559{
7560 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007561 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 return NULL;
7565
7566 if (substring == Py_None)
7567 return split(self, NULL, maxcount);
7568 else if (PyUnicode_Check(substring))
7569 return split(self, (PyUnicodeObject *)substring, maxcount);
7570 else
7571 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7572}
7573
Thomas Wouters477c8d52006-05-27 19:21:47 +00007574PyObject *
7575PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7576{
7577 PyObject* str_obj;
7578 PyObject* sep_obj;
7579 PyObject* out;
7580
7581 str_obj = PyUnicode_FromObject(str_in);
7582 if (!str_obj)
7583 return NULL;
7584 sep_obj = PyUnicode_FromObject(sep_in);
7585 if (!sep_obj) {
7586 Py_DECREF(str_obj);
7587 return NULL;
7588 }
7589
7590 out = stringlib_partition(
7591 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7592 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7593 );
7594
7595 Py_DECREF(sep_obj);
7596 Py_DECREF(str_obj);
7597
7598 return out;
7599}
7600
7601
7602PyObject *
7603PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7604{
7605 PyObject* str_obj;
7606 PyObject* sep_obj;
7607 PyObject* out;
7608
7609 str_obj = PyUnicode_FromObject(str_in);
7610 if (!str_obj)
7611 return NULL;
7612 sep_obj = PyUnicode_FromObject(sep_in);
7613 if (!sep_obj) {
7614 Py_DECREF(str_obj);
7615 return NULL;
7616 }
7617
7618 out = stringlib_rpartition(
7619 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7620 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7621 );
7622
7623 Py_DECREF(sep_obj);
7624 Py_DECREF(str_obj);
7625
7626 return out;
7627}
7628
7629PyDoc_STRVAR(partition__doc__,
7630"S.partition(sep) -> (head, sep, tail)\n\
7631\n\
7632Searches for the separator sep in S, and returns the part before it,\n\
7633the separator itself, and the part after it. If the separator is not\n\
7634found, returns S and two empty strings.");
7635
7636static PyObject*
7637unicode_partition(PyUnicodeObject *self, PyObject *separator)
7638{
7639 return PyUnicode_Partition((PyObject *)self, separator);
7640}
7641
7642PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007643"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007644\n\
7645Searches for the separator sep in S, starting at the end of S, and returns\n\
7646the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007647separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007648
7649static PyObject*
7650unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7651{
7652 return PyUnicode_RPartition((PyObject *)self, separator);
7653}
7654
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007655PyObject *PyUnicode_RSplit(PyObject *s,
7656 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007657 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007658{
7659 PyObject *result;
7660
7661 s = PyUnicode_FromObject(s);
7662 if (s == NULL)
7663 return NULL;
7664 if (sep != NULL) {
7665 sep = PyUnicode_FromObject(sep);
7666 if (sep == NULL) {
7667 Py_DECREF(s);
7668 return NULL;
7669 }
7670 }
7671
7672 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7673
7674 Py_DECREF(s);
7675 Py_XDECREF(sep);
7676 return result;
7677}
7678
7679PyDoc_STRVAR(rsplit__doc__,
7680"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7681\n\
7682Return a list of the words in S, using sep as the\n\
7683delimiter string, starting at the end of the string and\n\
7684working to the front. If maxsplit is given, at most maxsplit\n\
7685splits are done. If sep is not specified, any whitespace string\n\
7686is a separator.");
7687
7688static PyObject*
7689unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7690{
7691 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007692 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007693
Martin v. Löwis18e16552006-02-15 17:27:45 +00007694 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007695 return NULL;
7696
7697 if (substring == Py_None)
7698 return rsplit(self, NULL, maxcount);
7699 else if (PyUnicode_Check(substring))
7700 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7701 else
7702 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7703}
7704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007705PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007706"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707\n\
7708Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007709Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007710is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711
7712static PyObject*
7713unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7714{
Guido van Rossum86662912000-04-11 15:38:46 +00007715 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716
Guido van Rossum86662912000-04-11 15:38:46 +00007717 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 return NULL;
7719
Guido van Rossum86662912000-04-11 15:38:46 +00007720 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721}
7722
7723static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007724PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725{
Walter Dörwald346737f2007-05-31 10:44:43 +00007726 if (PyUnicode_CheckExact(self)) {
7727 Py_INCREF(self);
7728 return self;
7729 } else
7730 /* Subtype -- return genuine unicode string with the same value. */
7731 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7732 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733}
7734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007735PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736"S.swapcase() -> unicode\n\
7737\n\
7738Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007739and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
7741static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007742unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 return fixup(self, fixswapcase);
7745}
7746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748"S.translate(table) -> unicode\n\
7749\n\
7750Return a copy of the string S, where all characters have been mapped\n\
7751through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007752Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7753Unmapped characters are left untouched. Characters mapped to None\n\
7754are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007757unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758{
Tim Petersced69f82003-09-16 20:30:58 +00007759 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007761 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 "ignore");
7763}
7764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007765PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766"S.upper() -> unicode\n\
7767\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007768Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769
7770static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007771unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 return fixup(self, fixupper);
7774}
7775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007776PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777"S.zfill(width) -> unicode\n\
7778\n\
7779Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781
7782static PyObject *
7783unicode_zfill(PyUnicodeObject *self, PyObject *args)
7784{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007785 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 PyUnicodeObject *u;
7787
Martin v. Löwis18e16552006-02-15 17:27:45 +00007788 Py_ssize_t width;
7789 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 return NULL;
7791
7792 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007793 if (PyUnicode_CheckExact(self)) {
7794 Py_INCREF(self);
7795 return (PyObject*) self;
7796 }
7797 else
7798 return PyUnicode_FromUnicode(
7799 PyUnicode_AS_UNICODE(self),
7800 PyUnicode_GET_SIZE(self)
7801 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 }
7803
7804 fill = width - self->length;
7805
7806 u = pad(self, fill, 0, '0');
7807
Walter Dörwald068325e2002-04-15 13:36:47 +00007808 if (u == NULL)
7809 return NULL;
7810
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 if (u->str[fill] == '+' || u->str[fill] == '-') {
7812 /* move sign to beginning of string */
7813 u->str[0] = u->str[fill];
7814 u->str[fill] = '0';
7815 }
7816
7817 return (PyObject*) u;
7818}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819
#if 0
/* Compiled out.  Reports the current value of unicode_freelist_size
   as a Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);
    return size;
}
#endif
7827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007828PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007829"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007831Return True if S starts with the specified prefix, False otherwise.\n\
7832With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007833With optional end, stop comparing S at that position.\n\
7834prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835
7836static PyObject *
7837unicode_startswith(PyUnicodeObject *self,
7838 PyObject *args)
7839{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007840 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007842 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007843 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007844 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007846 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007847 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007849 if (PyTuple_Check(subobj)) {
7850 Py_ssize_t i;
7851 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7852 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7853 PyTuple_GET_ITEM(subobj, i));
7854 if (substring == NULL)
7855 return NULL;
7856 result = tailmatch(self, substring, start, end, -1);
7857 Py_DECREF(substring);
7858 if (result) {
7859 Py_RETURN_TRUE;
7860 }
7861 }
7862 /* nothing matched */
7863 Py_RETURN_FALSE;
7864 }
7865 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007867 return NULL;
7868 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007870 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871}
7872
7873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007874PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007875"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007877Return True if S ends with the specified suffix, False otherwise.\n\
7878With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007879With optional end, stop comparing S at that position.\n\
7880suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881
7882static PyObject *
7883unicode_endswith(PyUnicodeObject *self,
7884 PyObject *args)
7885{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007886 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007888 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007889 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007890 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007892 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7893 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007895 if (PyTuple_Check(subobj)) {
7896 Py_ssize_t i;
7897 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7898 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7899 PyTuple_GET_ITEM(subobj, i));
7900 if (substring == NULL)
7901 return NULL;
7902 result = tailmatch(self, substring, start, end, +1);
7903 Py_DECREF(substring);
7904 if (result) {
7905 Py_RETURN_TRUE;
7906 }
7907 }
7908 Py_RETURN_FALSE;
7909 }
7910 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007914 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007916 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917}
7918
Eric Smith8c663262007-08-25 02:26:07 +00007919#include "stringlib/string_format.h"
7920
7921PyDoc_STRVAR(format__doc__,
7922"S.format(*args, **kwargs) -> unicode\n\
7923\n\
7924");
7925
7926static PyObject *
7927unicode_format(PyObject *self, PyObject *args, PyObject *kwds)
7928{
7929 /* this calls into stringlib/string_format.h because it can be
7930 included for either string or unicode. this is needed for
7931 python 2.6. */
7932 return do_string_format(self, args, kwds);
7933}
7934
7935
7936PyDoc_STRVAR(p_format__doc__,
7937"S.__format__(format_spec) -> unicode\n\
7938\n\
7939");
7940
7941static PyObject *
7942unicode__format__(PyObject *self, PyObject *args)
7943{
7944 return unicode_unicode__format__(self, args);
7945}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007947
7948static PyObject *
7949unicode_getnewargs(PyUnicodeObject *v)
7950{
7951 return Py_BuildValue("(u#)", v->str, v->length);
7952}
7953
7954
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955static PyMethodDef unicode_methods[] = {
7956
7957 /* Order is according to common usage: often used methods should
7958 appear first, since lookup is done sequentially. */
7959
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007960 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7961 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7962 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007963 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007964 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7965 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7966 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7967 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7968 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7969 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7970 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007971 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007972 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7973 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7974 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007975 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007976 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007977/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7978 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7979 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7980 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007981 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007982 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007983 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007984 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007985 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7986 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7987 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7988 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7989 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7990 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7991 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7992 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7993 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7994 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7995 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7996 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7997 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7998 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00007999 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008000 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith8c663262007-08-25 02:26:07 +00008001 {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8002 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008003 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8004 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008005#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008006 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007#endif
8008
8009#if 0
8010 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008011 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012#endif
8013
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008014 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 {NULL, NULL}
8016};
8017
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008018static PyObject *
8019unicode_mod(PyObject *v, PyObject *w)
8020{
8021 if (!PyUnicode_Check(v)) {
8022 Py_INCREF(Py_NotImplemented);
8023 return Py_NotImplemented;
8024 }
8025 return PyUnicode_Format(v, w);
8026}
8027
8028static PyNumberMethods unicode_as_number = {
8029 0, /*nb_add*/
8030 0, /*nb_subtract*/
8031 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008032 unicode_mod, /*nb_remainder*/
8033};
8034
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008036 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008037 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008038 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8039 (ssizeargfunc) unicode_getitem, /* sq_item */
8040 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 0, /* sq_ass_item */
8042 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008043 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044};
8045
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008046static PyObject*
8047unicode_subscript(PyUnicodeObject* self, PyObject* item)
8048{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008049 if (PyIndex_Check(item)) {
8050 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008051 if (i == -1 && PyErr_Occurred())
8052 return NULL;
8053 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008054 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008055 return unicode_getitem(self, i);
8056 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008057 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008058 Py_UNICODE* source_buf;
8059 Py_UNICODE* result_buf;
8060 PyObject* result;
8061
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008062 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008063 &start, &stop, &step, &slicelength) < 0) {
8064 return NULL;
8065 }
8066
8067 if (slicelength <= 0) {
8068 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008069 } else if (start == 0 && step == 1 && slicelength == self->length &&
8070 PyUnicode_CheckExact(self)) {
8071 Py_INCREF(self);
8072 return (PyObject *)self;
8073 } else if (step == 1) {
8074 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008075 } else {
8076 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008077 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8078 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008079
8080 if (result_buf == NULL)
8081 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008082
8083 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8084 result_buf[i] = source_buf[cur];
8085 }
Tim Petersced69f82003-09-16 20:30:58 +00008086
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008087 result = PyUnicode_FromUnicode(result_buf, slicelength);
8088 PyMem_FREE(result_buf);
8089 return result;
8090 }
8091 } else {
8092 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8093 return NULL;
8094 }
8095}
8096
8097static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008098 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008099 (binaryfunc)unicode_subscript, /* mp_subscript */
8100 (objobjargproc)0, /* mp_ass_subscript */
8101};
8102
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103
8104static int
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008105unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008108 if (flags & PyBUF_CHARACTER) {
Guido van Rossuma74184e2007-08-29 04:05:57 +00008109 PyErr_SetString(PyExc_SystemError, "can't use str as char buffer");
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00008110 abort();
Guido van Rossuma74184e2007-08-29 04:05:57 +00008111 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 }
Guido van Rossuma74184e2007-08-29 04:05:57 +00008113 return PyBuffer_FillInfo(view, (void *)self->str,
8114 PyUnicode_GET_DATA_SIZE(self), 1, flags);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115}
8116
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008117
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118/* Helpers for PyUnicode_Format() */
8119
8120static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008121getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008123 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 if (argidx < arglen) {
8125 (*p_argidx)++;
8126 if (arglen < 0)
8127 return args;
8128 else
8129 return PyTuple_GetItem(args, argidx);
8130 }
8131 PyErr_SetString(PyExc_TypeError,
8132 "not enough arguments for format string");
8133 return NULL;
8134}
8135
/* Bit flags, one bit each, used by the formatting helpers below. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
8141
Martin v. Löwis18e16552006-02-15 17:27:45 +00008142static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008143strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145 register Py_ssize_t i;
8146 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 for (i = len - 1; i >= 0; i--)
8148 buffer[i] = (Py_UNICODE) charbuffer[i];
8149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 return len;
8151}
8152
Neal Norwitzfc76d632006-01-10 06:03:13 +00008153static int
8154doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8155{
Tim Peters15231542006-02-16 01:08:01 +00008156 Py_ssize_t result;
8157
Neal Norwitzfc76d632006-01-10 06:03:13 +00008158 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008159 result = strtounicode(buffer, (char *)buffer);
8160 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008161}
8162
8163static int
8164longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8165{
Tim Peters15231542006-02-16 01:08:01 +00008166 Py_ssize_t result;
8167
Neal Norwitzfc76d632006-01-10 06:03:13 +00008168 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008169 result = strtounicode(buffer, (char *)buffer);
8170 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008171}
8172
Guido van Rossum078151d2002-08-11 04:24:12 +00008173/* XXX To save some code duplication, formatfloat/long/int could have been
8174 shared with stringobject.c, converting from 8-bit to Unicode after the
8175 formatting is done. */
8176
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177static int
8178formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008179 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 int flags,
8181 int prec,
8182 int type,
8183 PyObject *v)
8184{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008185 /* fmt = '%#.' + `prec` + `type`
8186 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 char fmt[20];
8188 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008189
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 x = PyFloat_AsDouble(v);
8191 if (x == -1.0 && PyErr_Occurred())
8192 return -1;
8193 if (prec < 0)
8194 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8196 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008197 /* Worst case length calc to ensure no buffer overrun:
8198
8199 'g' formats:
8200 fmt = %#.<prec>g
8201 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8202 for any double rep.)
8203 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8204
8205 'f' formats:
8206 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8207 len = 1 + 50 + 1 + prec = 52 + prec
8208
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008209 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008210 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008211
8212 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008213 if (((type == 'g' || type == 'G') &&
8214 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008215 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008216 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008217 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008218 return -1;
8219 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008220 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8221 (flags&F_ALT) ? "#" : "",
8222 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008223 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224}
8225
Tim Peters38fd5b62000-09-21 05:43:11 +00008226static PyObject*
8227formatlong(PyObject *val, int flags, int prec, int type)
8228{
8229 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008230 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008231 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008232 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008233
8234 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8235 if (!str)
8236 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008237 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008238 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008239 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008240}
8241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242static int
8243formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008244 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 int flags,
8246 int prec,
8247 int type,
8248 PyObject *v)
8249{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008250 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008251 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8252 * + 1 + 1
8253 * = 24
8254 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008255 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008256 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 long x;
8258
8259 x = PyInt_AsLong(v);
8260 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008261 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008262 if (x < 0 && type == 'u') {
8263 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008264 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008265 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8266 sign = "-";
8267 else
8268 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008270 prec = 1;
8271
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008272 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8273 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008274 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008275 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008276 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008277 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008278 return -1;
8279 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008280
8281 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008282 (type == 'x' || type == 'X' || type == 'o')) {
8283 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008284 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008285 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008286 * - when 0 is being converted, the C standard leaves off
8287 * the '0x' or '0X', which is inconsistent with other
8288 * %#x/%#X conversions and inconsistent with Python's
8289 * hex() function
8290 * - there are platforms that violate the standard and
8291 * convert 0 with the '0x' or '0X'
8292 * (Metrowerks, Compaq Tru64)
8293 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008294 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008295 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008296 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008297 * We can achieve the desired consistency by inserting our
8298 * own '0x' or '0X' prefix, and substituting %x/%X in place
8299 * of %#x/%#X.
8300 *
8301 * Note that this is the same approach as used in
8302 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008303 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008304 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8305 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008306 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008307 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008308 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8309 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008310 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008311 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008312 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008313 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008314 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008315 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316}
8317
8318static int
8319formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008320 size_t buflen,
8321 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008323 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008324 if (PyUnicode_Check(v)) {
8325 if (PyUnicode_GET_SIZE(v) != 1)
8326 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008330 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008331 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008332 goto onError;
8333 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335
8336 else {
8337 /* Integer input truncated to a character */
8338 long x;
8339 x = PyInt_AsLong(v);
8340 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008341 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008342#ifdef Py_UNICODE_WIDE
8343 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008344 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008345 "%c arg not in range(0x110000) "
8346 "(wide Python build)");
8347 return -1;
8348 }
8349#else
8350 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008351 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008352 "%c arg not in range(0x10000) "
8353 "(narrow Python build)");
8354 return -1;
8355 }
8356#endif
8357 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 }
8359 buf[1] = '\0';
8360 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008361
8362 onError:
8363 PyErr_SetString(PyExc_TypeError,
8364 "%c requires int or char");
8365 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366}
8367
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is wrapped in parentheses so that the cast cannot bind
   to neighbouring tokens at the point of use (for example in
   `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8377
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378PyObject *PyUnicode_Format(PyObject *format,
8379 PyObject *args)
8380{
8381 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008382 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 int args_owned = 0;
8384 PyUnicodeObject *result = NULL;
8385 PyObject *dict = NULL;
8386 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008387
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 if (format == NULL || args == NULL) {
8389 PyErr_BadInternalCall();
8390 return NULL;
8391 }
8392 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008393 if (uformat == NULL)
8394 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 fmt = PyUnicode_AS_UNICODE(uformat);
8396 fmtcnt = PyUnicode_GET_SIZE(uformat);
8397
8398 reslen = rescnt = fmtcnt + 100;
8399 result = _PyUnicode_New(reslen);
8400 if (result == NULL)
8401 goto onError;
8402 res = PyUnicode_AS_UNICODE(result);
8403
8404 if (PyTuple_Check(args)) {
8405 arglen = PyTuple_Size(args);
8406 argidx = 0;
8407 }
8408 else {
8409 arglen = -1;
8410 argidx = -2;
8411 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008412 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008413 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 dict = args;
8415
8416 while (--fmtcnt >= 0) {
8417 if (*fmt != '%') {
8418 if (--rescnt < 0) {
8419 rescnt = fmtcnt + 100;
8420 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008421 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008422 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8424 --rescnt;
8425 }
8426 *res++ = *fmt++;
8427 }
8428 else {
8429 /* Got a format specifier */
8430 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 Py_UNICODE c = '\0';
8434 Py_UNICODE fill;
8435 PyObject *v = NULL;
8436 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008437 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008439 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008440 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441
8442 fmt++;
8443 if (*fmt == '(') {
8444 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008445 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 PyObject *key;
8447 int pcount = 1;
8448
8449 if (dict == NULL) {
8450 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008451 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 goto onError;
8453 }
8454 ++fmt;
8455 --fmtcnt;
8456 keystart = fmt;
8457 /* Skip over balanced parentheses */
8458 while (pcount > 0 && --fmtcnt >= 0) {
8459 if (*fmt == ')')
8460 --pcount;
8461 else if (*fmt == '(')
8462 ++pcount;
8463 fmt++;
8464 }
8465 keylen = fmt - keystart - 1;
8466 if (fmtcnt < 0 || pcount > 0) {
8467 PyErr_SetString(PyExc_ValueError,
8468 "incomplete format key");
8469 goto onError;
8470 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008471#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008472 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 then looked up since Python uses strings to hold
8474 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008475 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 key = PyUnicode_EncodeUTF8(keystart,
8477 keylen,
8478 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008479#else
8480 key = PyUnicode_FromUnicode(keystart, keylen);
8481#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 if (key == NULL)
8483 goto onError;
8484 if (args_owned) {
8485 Py_DECREF(args);
8486 args_owned = 0;
8487 }
8488 args = PyObject_GetItem(dict, key);
8489 Py_DECREF(key);
8490 if (args == NULL) {
8491 goto onError;
8492 }
8493 args_owned = 1;
8494 arglen = -1;
8495 argidx = -2;
8496 }
8497 while (--fmtcnt >= 0) {
8498 switch (c = *fmt++) {
8499 case '-': flags |= F_LJUST; continue;
8500 case '+': flags |= F_SIGN; continue;
8501 case ' ': flags |= F_BLANK; continue;
8502 case '#': flags |= F_ALT; continue;
8503 case '0': flags |= F_ZERO; continue;
8504 }
8505 break;
8506 }
8507 if (c == '*') {
8508 v = getnextarg(args, arglen, &argidx);
8509 if (v == NULL)
8510 goto onError;
8511 if (!PyInt_Check(v)) {
8512 PyErr_SetString(PyExc_TypeError,
8513 "* wants int");
8514 goto onError;
8515 }
8516 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008517 if (width == -1 && PyErr_Occurred())
8518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 if (width < 0) {
8520 flags |= F_LJUST;
8521 width = -width;
8522 }
8523 if (--fmtcnt >= 0)
8524 c = *fmt++;
8525 }
8526 else if (c >= '0' && c <= '9') {
8527 width = c - '0';
8528 while (--fmtcnt >= 0) {
8529 c = *fmt++;
8530 if (c < '0' || c > '9')
8531 break;
8532 if ((width*10) / 10 != width) {
8533 PyErr_SetString(PyExc_ValueError,
8534 "width too big");
8535 goto onError;
8536 }
8537 width = width*10 + (c - '0');
8538 }
8539 }
8540 if (c == '.') {
8541 prec = 0;
8542 if (--fmtcnt >= 0)
8543 c = *fmt++;
8544 if (c == '*') {
8545 v = getnextarg(args, arglen, &argidx);
8546 if (v == NULL)
8547 goto onError;
8548 if (!PyInt_Check(v)) {
8549 PyErr_SetString(PyExc_TypeError,
8550 "* wants int");
8551 goto onError;
8552 }
8553 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008554 if (prec == -1 && PyErr_Occurred())
8555 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 if (prec < 0)
8557 prec = 0;
8558 if (--fmtcnt >= 0)
8559 c = *fmt++;
8560 }
8561 else if (c >= '0' && c <= '9') {
8562 prec = c - '0';
8563 while (--fmtcnt >= 0) {
8564 c = Py_CHARMASK(*fmt++);
8565 if (c < '0' || c > '9')
8566 break;
8567 if ((prec*10) / 10 != prec) {
8568 PyErr_SetString(PyExc_ValueError,
8569 "prec too big");
8570 goto onError;
8571 }
8572 prec = prec*10 + (c - '0');
8573 }
8574 }
8575 } /* prec */
8576 if (fmtcnt >= 0) {
8577 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 if (--fmtcnt >= 0)
8579 c = *fmt++;
8580 }
8581 }
8582 if (fmtcnt < 0) {
8583 PyErr_SetString(PyExc_ValueError,
8584 "incomplete format");
8585 goto onError;
8586 }
8587 if (c != '%') {
8588 v = getnextarg(args, arglen, &argidx);
8589 if (v == NULL)
8590 goto onError;
8591 }
8592 sign = 0;
8593 fill = ' ';
8594 switch (c) {
8595
8596 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008597 pbuf = formatbuf;
8598 /* presume that buffer length is at least 1 */
8599 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 len = 1;
8601 break;
8602
8603 case 's':
8604 case 'r':
8605 if (PyUnicode_Check(v) && c == 's') {
8606 temp = v;
8607 Py_INCREF(temp);
8608 }
8609 else {
8610 PyObject *unicode;
8611 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008612 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 else
8614 temp = PyObject_Repr(v);
8615 if (temp == NULL)
8616 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008617 if (PyUnicode_Check(temp))
8618 /* nothing to do */;
8619 else if (PyString_Check(temp)) {
8620 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008621 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008623 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008625 Py_DECREF(temp);
8626 temp = unicode;
8627 if (temp == NULL)
8628 goto onError;
8629 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008630 else {
8631 Py_DECREF(temp);
8632 PyErr_SetString(PyExc_TypeError,
8633 "%s argument has non-string str()");
8634 goto onError;
8635 }
8636 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008637 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 len = PyUnicode_GET_SIZE(temp);
8639 if (prec >= 0 && len > prec)
8640 len = prec;
8641 break;
8642
8643 case 'i':
8644 case 'd':
8645 case 'u':
8646 case 'o':
8647 case 'x':
8648 case 'X':
8649 if (c == 'i')
8650 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008651 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008652 temp = formatlong(v, flags, prec, c);
8653 if (!temp)
8654 goto onError;
8655 pbuf = PyUnicode_AS_UNICODE(temp);
8656 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008657 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008659 else {
8660 pbuf = formatbuf;
8661 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8662 flags, prec, c, v);
8663 if (len < 0)
8664 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008665 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008666 }
8667 if (flags & F_ZERO)
8668 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 break;
8670
8671 case 'e':
8672 case 'E':
8673 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008674 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 case 'g':
8676 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008677 if (c == 'F')
8678 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008679 pbuf = formatbuf;
8680 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8681 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 if (len < 0)
8683 goto onError;
8684 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008685 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 fill = '0';
8687 break;
8688
8689 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008690 pbuf = formatbuf;
8691 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 if (len < 0)
8693 goto onError;
8694 break;
8695
8696 default:
8697 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008698 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008699 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008700 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008701 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008702 (Py_ssize_t)(fmt - 1 -
8703 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 goto onError;
8705 }
8706 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008707 if (*pbuf == '-' || *pbuf == '+') {
8708 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 len--;
8710 }
8711 else if (flags & F_SIGN)
8712 sign = '+';
8713 else if (flags & F_BLANK)
8714 sign = ' ';
8715 else
8716 sign = 0;
8717 }
8718 if (width < len)
8719 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008720 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 reslen -= rescnt;
8722 rescnt = width + fmtcnt + 100;
8723 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008724 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008725 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008726 PyErr_NoMemory();
8727 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008728 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008729 if (_PyUnicode_Resize(&result, reslen) < 0) {
8730 Py_XDECREF(temp);
8731 goto onError;
8732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 res = PyUnicode_AS_UNICODE(result)
8734 + reslen - rescnt;
8735 }
8736 if (sign) {
8737 if (fill != ' ')
8738 *res++ = sign;
8739 rescnt--;
8740 if (width > len)
8741 width--;
8742 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008743 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008744 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008745 assert(pbuf[1] == c);
8746 if (fill != ' ') {
8747 *res++ = *pbuf++;
8748 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008749 }
Tim Petersfff53252001-04-12 18:38:48 +00008750 rescnt -= 2;
8751 width -= 2;
8752 if (width < 0)
8753 width = 0;
8754 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 if (width > len && !(flags & F_LJUST)) {
8757 do {
8758 --rescnt;
8759 *res++ = fill;
8760 } while (--width > len);
8761 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008762 if (fill == ' ') {
8763 if (sign)
8764 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008765 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008766 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008767 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008768 *res++ = *pbuf++;
8769 *res++ = *pbuf++;
8770 }
8771 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008772 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 res += len;
8774 rescnt -= len;
8775 while (--width >= len) {
8776 --rescnt;
8777 *res++ = ' ';
8778 }
8779 if (dict && (argidx < arglen) && c != '%') {
8780 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008781 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008782 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 goto onError;
8784 }
8785 Py_XDECREF(temp);
8786 } /* '%' */
8787 } /* until end */
8788 if (argidx < arglen && !dict) {
8789 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008790 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 goto onError;
8792 }
8793
Thomas Woutersa96affe2006-03-12 00:29:36 +00008794 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8795 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 if (args_owned) {
8797 Py_DECREF(args);
8798 }
8799 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 return (PyObject *)result;
8801
8802 onError:
8803 Py_XDECREF(result);
8804 Py_DECREF(uformat);
8805 if (args_owned) {
8806 Py_DECREF(args);
8807 }
8808 return NULL;
8809}
8810
8811static PyBufferProcs unicode_as_buffer = {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008812 (getbufferproc) unicode_buffer_getbuffer,
8813 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814};
8815
Jeremy Hylton938ace62002-07-17 16:30:39 +00008816static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008817unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8818
Tim Peters6d6c1a32001-08-02 04:15:00 +00008819static PyObject *
8820unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8821{
8822 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008823 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008824 char *encoding = NULL;
8825 char *errors = NULL;
8826
Guido van Rossume023fe02001-08-30 03:12:59 +00008827 if (type != &PyUnicode_Type)
8828 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008829 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8830 kwlist, &x, &encoding, &errors))
8831 return NULL;
8832 if (x == NULL)
8833 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008834 if (encoding == NULL && errors == NULL)
8835 return PyObject_Unicode(x);
8836 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008837 return PyUnicode_FromEncodedObject(x, encoding, errors);
8838}
8839
Guido van Rossume023fe02001-08-30 03:12:59 +00008840static PyObject *
8841unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8842{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008843 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008844 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008845
8846 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8847 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8848 if (tmp == NULL)
8849 return NULL;
8850 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008851 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008852 if (pnew == NULL) {
8853 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008854 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008855 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008856 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8857 if (pnew->str == NULL) {
8858 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008859 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008860 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008861 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008862 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008863 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8864 pnew->length = n;
8865 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008866 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008867 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008868}
8869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008870PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008871"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008872\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008873Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008874encoding defaults to the current default string encoding.\n\
8875errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008876
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008877static PyObject *unicode_iter(PyObject *seq);
8878
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008880 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008881 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 sizeof(PyUnicodeObject), /* tp_size */
8883 0, /* tp_itemsize */
8884 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008885 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008887 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008889 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008890 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008891 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008893 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894 (hashfunc) unicode_hash, /* tp_hash*/
8895 0, /* tp_call*/
8896 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008897 PyObject_GenericGetAttr, /* tp_getattro */
8898 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008900 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8901 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008902 unicode_doc, /* tp_doc */
8903 0, /* tp_traverse */
8904 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008905 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008906 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008907 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008908 0, /* tp_iternext */
8909 unicode_methods, /* tp_methods */
8910 0, /* tp_members */
8911 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008912 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008913 0, /* tp_dict */
8914 0, /* tp_descr_get */
8915 0, /* tp_descr_set */
8916 0, /* tp_dictoffset */
8917 0, /* tp_init */
8918 0, /* tp_alloc */
8919 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008920 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921};
8922
8923/* Initialize the Unicode implementation */
8924
Thomas Wouters78890102000-07-22 19:25:51 +00008925void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008927 int i;
8928
Thomas Wouters477c8d52006-05-27 19:21:47 +00008929 /* XXX - move this array to unicodectype.c ? */
8930 Py_UNICODE linebreak[] = {
8931 0x000A, /* LINE FEED */
8932 0x000D, /* CARRIAGE RETURN */
8933 0x001C, /* FILE SEPARATOR */
8934 0x001D, /* GROUP SEPARATOR */
8935 0x001E, /* RECORD SEPARATOR */
8936 0x0085, /* NEXT LINE */
8937 0x2028, /* LINE SEPARATOR */
8938 0x2029, /* PARAGRAPH SEPARATOR */
8939 };
8940
Fred Drakee4315f52000-05-09 19:53:39 +00008941 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008942 unicode_freelist = NULL;
8943 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008945 if (!unicode_empty)
8946 return;
8947
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008948 for (i = 0; i < 256; i++)
8949 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008950 if (PyType_Ready(&PyUnicode_Type) < 0)
8951 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008952
8953 /* initialize the linebreak bloom filter */
8954 bloom_linebreak = make_bloom_mask(
8955 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8956 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008957
8958 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959}
8960
8961/* Finalize the Unicode implementation */
8962
8963void
Thomas Wouters78890102000-07-22 19:25:51 +00008964_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008966 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008967 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008969 Py_XDECREF(unicode_empty);
8970 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008971
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008972 for (i = 0; i < 256; i++) {
8973 if (unicode_latin1[i]) {
8974 Py_DECREF(unicode_latin1[i]);
8975 unicode_latin1[i] = NULL;
8976 }
8977 }
8978
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008979 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 PyUnicodeObject *v = u;
8981 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008982 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008983 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008984 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008985 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008987 unicode_freelist = NULL;
8988 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008990
Walter Dörwald16807132007-05-25 13:52:07 +00008991void
8992PyUnicode_InternInPlace(PyObject **p)
8993{
8994 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8995 PyObject *t;
8996 if (s == NULL || !PyUnicode_Check(s))
8997 Py_FatalError(
8998 "PyUnicode_InternInPlace: unicode strings only please!");
8999 /* If it's a subclass, we don't really know what putting
9000 it in the interned dict might do. */
9001 if (!PyUnicode_CheckExact(s))
9002 return;
9003 if (PyUnicode_CHECK_INTERNED(s))
9004 return;
9005 if (interned == NULL) {
9006 interned = PyDict_New();
9007 if (interned == NULL) {
9008 PyErr_Clear(); /* Don't leave an exception */
9009 return;
9010 }
9011 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009012 /* It might be that the GetItem call fails even
9013 though the key is present in the dictionary,
9014 namely when this happens during a stack overflow. */
9015 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009016 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009017 Py_END_ALLOW_RECURSION
9018
Walter Dörwald16807132007-05-25 13:52:07 +00009019 if (t) {
9020 Py_INCREF(t);
9021 Py_DECREF(*p);
9022 *p = t;
9023 return;
9024 }
9025
Martin v. Löwis5b222132007-06-10 09:51:05 +00009026 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009027 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9028 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009029 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009030 return;
9031 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009032 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009033 /* The two references in interned are not counted by refcnt.
9034 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009035 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009036 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9037}
9038
9039void
9040PyUnicode_InternImmortal(PyObject **p)
9041{
9042 PyUnicode_InternInPlace(p);
9043 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9044 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9045 Py_INCREF(*p);
9046 }
9047}
9048
9049PyObject *
9050PyUnicode_InternFromString(const char *cp)
9051{
9052 PyObject *s = PyUnicode_FromString(cp);
9053 if (s == NULL)
9054 return NULL;
9055 PyUnicode_InternInPlace(&s);
9056 return s;
9057}
9058
9059void _Py_ReleaseInternedUnicodeStrings(void)
9060{
9061 PyObject *keys;
9062 PyUnicodeObject *s;
9063 Py_ssize_t i, n;
9064 Py_ssize_t immortal_size = 0, mortal_size = 0;
9065
9066 if (interned == NULL || !PyDict_Check(interned))
9067 return;
9068 keys = PyDict_Keys(interned);
9069 if (keys == NULL || !PyList_Check(keys)) {
9070 PyErr_Clear();
9071 return;
9072 }
9073
9074 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9075 detector, interned unicode strings are not forcibly deallocated;
9076 rather, we give them their stolen references back, and then clear
9077 and DECREF the interned dict. */
9078
9079 n = PyList_GET_SIZE(keys);
9080 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9081 n);
9082 for (i = 0; i < n; i++) {
9083 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9084 switch (s->state) {
9085 case SSTATE_NOT_INTERNED:
9086 /* XXX Shouldn't happen */
9087 break;
9088 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009089 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009090 immortal_size += s->length;
9091 break;
9092 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009093 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009094 mortal_size += s->length;
9095 break;
9096 default:
9097 Py_FatalError("Inconsistent interned string state.");
9098 }
9099 s->state = SSTATE_NOT_INTERNED;
9100 }
9101 fprintf(stderr, "total size of all interned strings: "
9102 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9103 "mortal/immortal\n", mortal_size, immortal_size);
9104 Py_DECREF(keys);
9105 PyDict_Clear(interned);
9106 Py_DECREF(interned);
9107 interned = NULL;
9108}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009109
9110
9111/********************* Unicode Iterator **************************/
9112
9113typedef struct {
9114 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009115 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009116 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9117} unicodeiterobject;
9118
9119static void
9120unicodeiter_dealloc(unicodeiterobject *it)
9121{
9122 _PyObject_GC_UNTRACK(it);
9123 Py_XDECREF(it->it_seq);
9124 PyObject_GC_Del(it);
9125}
9126
9127static int
9128unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9129{
9130 Py_VISIT(it->it_seq);
9131 return 0;
9132}
9133
9134static PyObject *
9135unicodeiter_next(unicodeiterobject *it)
9136{
9137 PyUnicodeObject *seq;
9138 PyObject *item;
9139
9140 assert(it != NULL);
9141 seq = it->it_seq;
9142 if (seq == NULL)
9143 return NULL;
9144 assert(PyUnicode_Check(seq));
9145
9146 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009147 item = PyUnicode_FromUnicode(
9148 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009149 if (item != NULL)
9150 ++it->it_index;
9151 return item;
9152 }
9153
9154 Py_DECREF(seq);
9155 it->it_seq = NULL;
9156 return NULL;
9157}
9158
9159static PyObject *
9160unicodeiter_len(unicodeiterobject *it)
9161{
9162 Py_ssize_t len = 0;
9163 if (it->it_seq)
9164 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9165 return PyInt_FromSsize_t(len);
9166}
9167
9168PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9169
9170static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009171 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9172 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009173 {NULL, NULL} /* sentinel */
9174};
9175
9176PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009177 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009178 "unicodeiterator", /* tp_name */
9179 sizeof(unicodeiterobject), /* tp_basicsize */
9180 0, /* tp_itemsize */
9181 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009182 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009183 0, /* tp_print */
9184 0, /* tp_getattr */
9185 0, /* tp_setattr */
9186 0, /* tp_compare */
9187 0, /* tp_repr */
9188 0, /* tp_as_number */
9189 0, /* tp_as_sequence */
9190 0, /* tp_as_mapping */
9191 0, /* tp_hash */
9192 0, /* tp_call */
9193 0, /* tp_str */
9194 PyObject_GenericGetAttr, /* tp_getattro */
9195 0, /* tp_setattro */
9196 0, /* tp_as_buffer */
9197 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9198 0, /* tp_doc */
9199 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9200 0, /* tp_clear */
9201 0, /* tp_richcompare */
9202 0, /* tp_weaklistoffset */
9203 PyObject_SelfIter, /* tp_iter */
9204 (iternextfunc)unicodeiter_next, /* tp_iternext */
9205 unicodeiter_methods, /* tp_methods */
9206 0,
9207};
9208
9209static PyObject *
9210unicode_iter(PyObject *seq)
9211{
9212 unicodeiterobject *it;
9213
9214 if (!PyUnicode_Check(seq)) {
9215 PyErr_BadInternalCall();
9216 return NULL;
9217 }
9218 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9219 if (it == NULL)
9220 return NULL;
9221 it->it_index = 0;
9222 Py_INCREF(seq);
9223 it->it_seq = (PyUnicodeObject *)seq;
9224 _PyObject_GC_TRACK(it);
9225 return (PyObject *)it;
9226}
9227
Martin v. Löwis5b222132007-06-10 09:51:05 +00009228size_t
9229Py_UNICODE_strlen(const Py_UNICODE *u)
9230{
9231 int res = 0;
9232 while(*u++)
9233 res++;
9234 return res;
9235}
9236
9237Py_UNICODE*
9238Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9239{
9240 Py_UNICODE *u = s1;
9241 while ((*u++ = *s2++));
9242 return s1;
9243}
9244
9245Py_UNICODE*
9246Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9247{
9248 Py_UNICODE *u = s1;
9249 while ((*u++ = *s2++))
9250 if (n-- == 0)
9251 break;
9252 return s1;
9253}
9254
9255int
9256Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9257{
9258 while (*s1 && *s2 && *s1 == *s2)
9259 s1++, s2++;
9260 if (*s1 && *s2)
9261 return (*s1 < *s2) ? -1 : +1;
9262 if (*s1)
9263 return 1;
9264 if (*s2)
9265 return -1;
9266 return 0;
9267}
9268
9269Py_UNICODE*
9270Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9271{
9272 const Py_UNICODE *p;
9273 for (p = s; *p; p++)
9274 if (*p == c)
9275 return (Py_UNICODE*)p;
9276 return NULL;
9277}
9278
9279
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009280#ifdef __cplusplus
9281}
9282#endif
9283
9284
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009285/*
9286Local variables:
9287c-basic-offset: 4
9288indent-tabs-mode: nil
9289End:
9290*/