blob: 4e8b2ed3f11037c4c846533d1e04bcab19dc4bbf [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
56#define MAX_UNICODE_FREELIST_SIZE 1024
57
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Barry Warsaw51ac5802000-03-20 16:36:48 +000064 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
120 PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000122
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000124PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000126#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127 return 0x10FFFF;
128#else
129 /* This is actually an illegal character, so it should
130 not be passed to unichr. */
131 return 0xFFFF;
132#endif
133}
134
Thomas Wouters477c8d52006-05-27 19:21:47 +0000135/* --- Bloom Filters ----------------------------------------------------- */
136
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
140
141/* the linebreak mask is set up by Unicode_Init below */
142
/* Integer type holding one bloom bitmask (only the low 32 bits are used,
   because the bit index is a 5-bit value). */
#define BLOOM_MASK unsigned long

/* Mask consulted by BLOOM_LINEBREAK; it is initialised outside this block. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shift is done on an unsigned long constant so that bit 31 can be
   reached without shifting into the sign bit of an int, and mask is
   parenthesised so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154 /* calculate simple bloom-style bitmask for a given unicode string */
155
156 long mask;
157 Py_ssize_t i;
158
159 mask = 0;
160 for (i = 0; i < len; i++)
161 mask |= (1 << (ptr[i] & 0x1F));
162
163 return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168 Py_ssize_t i;
169
170 for (i = 0; i < setlen; i++)
171 if (set[i] == chr)
172 return 1;
173
174 return 0;
175}
176
/* True when chr is a member of set: the bloom mask is checked first and
   the exact scan is only run when the mask says "maybe".  The whole
   expansion is parenthesised so the macro can be negated or combined
   with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
179
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180/* --- Unicode Object ----------------------------------------------------- */
181
182static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000184 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185{
186 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000187
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000191
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192 /* Resizing shared object (unicode_empty or single character
193 objects) in-place is not allowed. Use PyUnicode_Resize()
194 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000195
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205 /* We allocate one more byte to make sure the string is Ux0000 terminated.
206 The overallocation is also used by fastsearch, which assumes that it's
207 safe to look at str[length] (without making any assumptions about what
208 it contains). */
209
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 oldstr = unicode->str;
211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 PyErr_NoMemory();
215 return -1;
216 }
217 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000222 if (unicode->defenc) {
223 Py_DECREF(unicode->defenc);
224 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000227
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 return 0;
229}
230
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234
235 XXX This allocator could further be enhanced by assuring that the
236 free list never reduces its size below 1.
237
238*/
239
240static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
243 register PyUnicodeObject *unicode;
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 if (length == 0 && unicode_empty != NULL) {
247 Py_INCREF(unicode_empty);
248 return unicode_empty;
249 }
250
251 /* Unicode freelist & memory allocation */
252 if (unicode_freelist) {
253 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000254 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 /* Keep-Alive optimization: we only upsize the buffer,
258 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000259 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000260 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000267 }
268 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 }
270 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 if (unicode == NULL)
273 return NULL;
274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000292 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000295
296 onError:
297 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000298 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300}
301
302static
Guido van Rossum9475a232001-10-05 20:51:39 +0000303void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
Walter Dörwald16807132007-05-25 13:52:07 +0000305 switch (PyUnicode_CHECK_INTERNED(unicode)) {
306 case SSTATE_NOT_INTERNED:
307 break;
308
309 case SSTATE_INTERNED_MORTAL:
310 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000311 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313 Py_FatalError(
314 "deletion of interned unicode string failed");
315 break;
316
317 case SSTATE_INTERNED_IMMORTAL:
318 Py_FatalError("Immortal interned unicode string died.");
319
320 default:
321 Py_FatalError("Inconsistent interned unicode string state.");
322 }
323
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 if (PyUnicode_CheckExact(unicode) &&
325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000326 /* Keep-Alive optimization */
327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000328 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 unicode->str = NULL;
330 unicode->length = 0;
331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000332 if (unicode->defenc) {
333 Py_DECREF(unicode->defenc);
334 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000335 }
336 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 *(PyUnicodeObject **)unicode = unicode_freelist;
338 unicode_freelist = unicode;
339 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 }
341 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000342 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000343 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000344 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346}
347
Martin v. Löwis18e16552006-02-15 17:27:45 +0000348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349{
350 register PyUnicodeObject *v;
351
352 /* Argument checks */
353 if (unicode == NULL) {
354 PyErr_BadInternalCall();
355 return -1;
356 }
357 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 PyErr_BadInternalCall();
360 return -1;
361 }
362
363 /* Resizing unicode_empty and single character objects is not
364 possible since these are being shared. We simply return a fresh
365 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000366 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 (v == unicode_empty || v->length == 1)) {
368 PyUnicodeObject *w = _PyUnicode_New(length);
369 if (w == NULL)
370 return -1;
371 Py_UNICODE_COPY(w->str, v->str,
372 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000373 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000374 *unicode = (PyObject *)w;
375 return 0;
376 }
377
378 /* Note that we don't have to modify *unicode for unshared Unicode
379 objects, since we can modify them in-place. */
380 return unicode_resize(v, length);
381}
382
/* Internal convenience wrapper, for use in unicodeobject.c only:
   forwards to PyUnicode_Resize() after casting the address of the
   object variable to PyObject **. */
#define _PyUnicode_Resize(objaddr, newlength) \
    PyUnicode_Resize(((PyObject **)(objaddr)), newlength)
386
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 PyUnicodeObject *unicode;
391
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392 /* If the Unicode data is known at construction time, we can apply
393 some optimizations which share commonly used objects. */
394 if (u != NULL) {
395
396 /* Optimization for empty strings */
397 if (size == 0 && unicode_empty != NULL) {
398 Py_INCREF(unicode_empty);
399 return (PyObject *)unicode_empty;
400 }
401
402 /* Single character Unicode objects in the Latin-1 range are
403 shared when using this constructor */
404 if (size == 1 && *u < 256) {
405 unicode = unicode_latin1[*u];
406 if (!unicode) {
407 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 if (!unicode)
409 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000410 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 unicode_latin1[*u] = unicode;
412 }
413 Py_INCREF(unicode);
414 return (PyObject *)unicode;
415 }
416 }
Tim Petersced69f82003-09-16 20:30:58 +0000417
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 unicode = _PyUnicode_New(size);
419 if (!unicode)
420 return NULL;
421
422 /* Copy the Unicode data into the new object */
423 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425
426 return (PyObject *)unicode;
427}
428
Walter Dörwaldd2034312007-05-18 16:29:38 +0000429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430{
431 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000433 some optimizations which share commonly used objects.
434 Also, this means the input must be UTF-8, so fall back to the
435 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 if (u != NULL) {
437
438 /* Optimization for empty strings */
439 if (size == 0 && unicode_empty != NULL) {
440 Py_INCREF(unicode_empty);
441 return (PyObject *)unicode_empty;
442 }
443
Martin v. Löwis9c121062007-08-05 20:26:11 +0000444 /* Single characters are shared when using this constructor.
445 Restrict to ASCII, since the input must be UTF-8. */
446 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000447 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 if (!unicode) {
449 unicode = _PyUnicode_New(1);
450 if (!unicode)
451 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode->str[0] = Py_CHARMASK(*u);
453 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 }
455 Py_INCREF(unicode);
456 return (PyObject *)unicode;
457 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000458
459 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 }
461
Walter Dörwald55507312007-05-18 13:12:10 +0000462 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 if (!unicode)
464 return NULL;
465
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000466 return (PyObject *)unicode;
467}
468
Walter Dörwaldd2034312007-05-18 16:29:38 +0000469PyObject *PyUnicode_FromString(const char *u)
470{
471 size_t size = strlen(u);
472 if (size > PY_SSIZE_T_MAX) {
473 PyErr_SetString(PyExc_OverflowError, "input too long");
474 return NULL;
475 }
476
477 return PyUnicode_FromStringAndSize(u, size);
478}
479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
485 PyUnicodeObject *unicode;
486
487 if (w == NULL) {
488 PyErr_BadInternalCall();
489 return NULL;
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000499#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 {
501 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000504 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 *u++ = *w++;
506 }
507#endif
508
509 return (PyObject *)unicode;
510}
511
Walter Dörwald346737f2007-05-31 10:44:43 +0000512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515 *fmt++ = '%';
516 if (width) {
517 if (zeropad)
518 *fmt++ = '0';
519 fmt += sprintf(fmt, "%d", width);
520 }
521 if (precision)
522 fmt += sprintf(fmt, ".%d", precision);
523 if (longflag)
524 *fmt++ = 'l';
525 else if (size_tflag) {
526 char *f = PY_FORMAT_SIZE_T;
527 while (*f)
528 *fmt++ = *f++;
529 }
530 *fmt++ = c;
531 *fmt = '\0';
532}
533
/* Append the NUL-terminated char string `string` to the output cursor
   `s`, one character per element.  Uses the variables `copy` and `s`
   of the expansion site; `s` is left pointing past the last character
   written and `copy` at the terminating NUL. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 Py_ssize_t callcount = 0;
541 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000542 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000544 int width = 0;
545 int precision = 0;
546 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 const char* f;
548 Py_UNICODE *s;
549 PyObject *string;
550 /* used by sprintf */
551 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000552 /* use abuffer instead of buffer, if we need more space
553 * (which can happen if there's a format specifier with width). */
554 char *abuffer = NULL;
555 char *realbuffer;
556 Py_ssize_t abuffersize = 0;
557 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000558 const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561 Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef __va_copy
564 __va_copy(count, vargs);
565#else
566 count = vargs;
567#endif
568#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000569 /* step 1: count the number of %S/%R format specifications
570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000574 ++callcount;
575 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 /* step 2: allocate memory for the results of
577 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 if (callcount) {
579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580 if (!callresults) {
581 PyErr_NoMemory();
582 return NULL;
583 }
584 callresult = callresults;
585 }
586 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000587 for (f = format; *f; f++) {
588 if (*f == '%') {
589 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000590 width = 0;
591 while (isdigit(Py_CHARMASK(*f)))
592 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594 ;
595
596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597 * they don't affect the amount of space we reserve.
598 */
599 if ((*f == 'l' || *f == 'z') &&
600 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000601 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000602
603 switch (*f) {
604 case 'c':
605 (void)va_arg(count, int);
606 /* fall through... */
607 case '%':
608 n++;
609 break;
610 case 'd': case 'u': case 'i': case 'x':
611 (void) va_arg(count, int);
612 /* 20 bytes is enough to hold a 64-bit
613 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000614 This isn't enough for octal.
615 If a width is specified we need more
616 (which we allocate later). */
617 if (width < 20)
618 width = 20;
619 n += width;
620 if (abuffersize < width)
621 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 break;
623 case 's':
624 n += strlen(va_arg(count, char*));
625 break;
626 case 'U':
627 {
628 PyObject *obj = va_arg(count, PyObject *);
629 assert(obj && PyUnicode_Check(obj));
630 n += PyUnicode_GET_SIZE(obj);
631 break;
632 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000633 case 'V':
634 {
635 PyObject *obj = va_arg(count, PyObject *);
636 const char *str = va_arg(count, const char *);
637 assert(obj || str);
638 assert(!obj || PyUnicode_Check(obj));
639 if (obj)
640 n += PyUnicode_GET_SIZE(obj);
641 else
642 n += strlen(str);
643 break;
644 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000645 case 'S':
646 {
647 PyObject *obj = va_arg(count, PyObject *);
648 PyObject *str;
649 assert(obj);
650 str = PyObject_Unicode(obj);
651 if (!str)
652 goto fail;
653 n += PyUnicode_GET_SIZE(str);
654 /* Remember the str and switch to the next slot */
655 *callresult++ = str;
656 break;
657 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000658 case 'R':
659 {
660 PyObject *obj = va_arg(count, PyObject *);
661 PyObject *repr;
662 assert(obj);
663 repr = PyObject_Repr(obj);
664 if (!repr)
665 goto fail;
666 n += PyUnicode_GET_SIZE(repr);
667 /* Remember the repr and switch to the next slot */
668 *callresult++ = repr;
669 break;
670 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000671 case 'p':
672 (void) va_arg(count, int);
673 /* maximum 64-bit pointer representation:
674 * 0xffffffffffffffff
675 * so 19 characters is enough.
676 * XXX I count 18 -- what's the extra for?
677 */
678 n += 19;
679 break;
680 default:
681 /* if we stumble upon an unknown
682 formatting code, copy the rest of
683 the format string to the output
684 string. (we cannot just skip the
685 code, since there's no way to know
686 what's in the argument list) */
687 n += strlen(p);
688 goto expand;
689 }
690 } else
691 n++;
692 }
693 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000694 if (abuffersize > 20) {
695 abuffer = PyMem_Malloc(abuffersize);
696 if (!abuffer) {
697 PyErr_NoMemory();
698 goto fail;
699 }
700 realbuffer = abuffer;
701 }
702 else
703 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000706 we don't have to resize the string.
707 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708 string = PyUnicode_FromUnicode(NULL, n);
709 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000710 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000713 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714
715 for (f = format; *f; f++) {
716 if (*f == '%') {
717 const char* p = f++;
718 int longflag = 0;
719 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000720 zeropad = (*f == '0');
721 /* parse the width.precision part */
722 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000724 width = (width*10) + *f++ - '0';
725 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 if (*f == '.') {
727 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000729 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000730 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 /* handle the long flag, but only for %ld and %lu.
732 others can be added when necessary. */
733 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
734 longflag = 1;
735 ++f;
736 }
737 /* handle the size_t flag. */
738 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
739 size_tflag = 1;
740 ++f;
741 }
742
743 switch (*f) {
744 case 'c':
745 *s++ = va_arg(vargs, int);
746 break;
747 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000754 sprintf(realbuffer, fmt, va_arg(vargs, int));
755 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 break;
757 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000764 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
765 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 break;
767 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000768 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
769 sprintf(realbuffer, fmt, va_arg(vargs, int));
770 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000771 break;
772 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000773 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
774 sprintf(realbuffer, fmt, va_arg(vargs, int));
775 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776 break;
777 case 's':
778 p = va_arg(vargs, char*);
779 appendstring(p);
780 break;
781 case 'U':
782 {
783 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000784 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
785 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
786 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 break;
788 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000789 case 'V':
790 {
791 PyObject *obj = va_arg(vargs, PyObject *);
792 const char *str = va_arg(vargs, const char *);
793 if (obj) {
794 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
795 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
796 s += size;
797 } else {
798 appendstring(str);
799 }
800 break;
801 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000802 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000803 case 'R':
804 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000805 Py_UNICODE *ucopy;
806 Py_ssize_t usize;
807 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000808 /* unused, since we already have the result */
809 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000810 ucopy = PyUnicode_AS_UNICODE(*callresult);
811 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 for (upos = 0; upos<usize;)
813 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000816 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000817 ++callresult;
818 break;
819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 case 'p':
821 sprintf(buffer, "%p", va_arg(vargs, void*));
822 /* %p is ill-defined: ensure leading 0x. */
823 if (buffer[1] == 'X')
824 buffer[1] = 'x';
825 else if (buffer[1] != 'x') {
826 memmove(buffer+2, buffer, strlen(buffer)+1);
827 buffer[0] = '0';
828 buffer[1] = 'x';
829 }
830 appendstring(buffer);
831 break;
832 case '%':
833 *s++ = '%';
834 break;
835 default:
836 appendstring(p);
837 goto end;
838 }
839 } else
840 *s++ = *f;
841 }
842
843 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 if (callresults)
845 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000846 if (abuffer)
847 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
849 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000850 fail:
851 if (callresults) {
852 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000853 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000854 Py_DECREF(*callresult2);
855 ++callresult2;
856 }
857 PyMem_Free(callresults);
858 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 if (abuffer)
860 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862}
863
864#undef appendstring
865
866PyObject *
867PyUnicode_FromFormat(const char *format, ...)
868{
869 PyObject* ret;
870 va_list vargs;
871
872#ifdef HAVE_STDARG_PROTOTYPES
873 va_start(vargs, format);
874#else
875 va_start(vargs);
876#endif
877 ret = PyUnicode_FromFormatV(format, vargs);
878 va_end(vargs);
879 return ret;
880}
881
Martin v. Löwis18e16552006-02-15 17:27:45 +0000882Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
883 wchar_t *w,
884 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
886 if (unicode == NULL) {
887 PyErr_BadInternalCall();
888 return -1;
889 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890
891 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000893 size = PyUnicode_GET_SIZE(unicode) + 1;
894
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895#ifdef HAVE_USABLE_WCHAR_T
896 memcpy(w, unicode->str, size * sizeof(wchar_t));
897#else
898 {
899 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000900 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000902 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 *w++ = *u++;
904 }
905#endif
906
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000907 if (size > PyUnicode_GET_SIZE(unicode))
908 return PyUnicode_GET_SIZE(unicode);
909 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910 return size;
911}
912
913#endif
914
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915PyObject *PyUnicode_FromOrdinal(int ordinal)
916{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000917 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000918
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 if (ordinal < 0 || ordinal > 0x10ffff) {
920 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000922 return NULL;
923 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000924
925#ifndef Py_UNICODE_WIDE
926 if (ordinal > 0xffff) {
927 ordinal -= 0x10000;
928 s[0] = 0xD800 | (ordinal >> 10);
929 s[1] = 0xDC00 | (ordinal & 0x3FF);
930 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000931 }
932#endif
933
Hye-Shik Chang40574832004-04-06 07:24:51 +0000934 s[0] = (Py_UNICODE)ordinal;
935 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000936}
937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938PyObject *PyUnicode_FromObject(register PyObject *obj)
939{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000940 /* XXX Perhaps we should make this API an alias of
941 PyObject_Unicode() instead ?! */
942 if (PyUnicode_CheckExact(obj)) {
943 Py_INCREF(obj);
944 return obj;
945 }
946 if (PyUnicode_Check(obj)) {
947 /* For a Unicode subtype that's not a Unicode object,
948 return a true Unicode object with the same data. */
949 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
950 PyUnicode_GET_SIZE(obj));
951 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000952 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
953}
954
955PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
956 const char *encoding,
957 const char *errors)
958{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000959 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000961 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 if (obj == NULL) {
964 PyErr_BadInternalCall();
965 return NULL;
966 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000967
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000968 if (PyUnicode_Check(obj)) {
969 PyErr_SetString(PyExc_TypeError,
970 "decoding Unicode is not supported");
971 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000972 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000973
974 /* Coerce object */
975 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 s = PyString_AS_STRING(obj);
977 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
980 /* Overwrite the error message with something more useful in
981 case of a TypeError. */
982 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 "coercing to Unicode: need string or buffer, "
985 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000986 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000987 goto onError;
988 }
Tim Petersced69f82003-09-16 20:30:58 +0000989
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000990 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 if (len == 0) {
992 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000993 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 }
Tim Petersced69f82003-09-16 20:30:58 +0000995 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000996 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000997
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000998 return v;
999
1000 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002}
1003
1004PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001005 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001006 const char *encoding,
1007 const char *errors)
1008{
1009 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001010
1011 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001012 encoding = PyUnicode_GetDefaultEncoding();
1013
1014 /* Shortcuts for common default encodings */
1015 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001017 else if (strcmp(encoding, "latin-1") == 0)
1018 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001019#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1020 else if (strcmp(encoding, "mbcs") == 0)
1021 return PyUnicode_DecodeMBCS(s, size, errors);
1022#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001023 else if (strcmp(encoding, "ascii") == 0)
1024 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025
1026 /* Decode via the codec registry */
1027 buffer = PyBuffer_FromMemory((void *)s, size);
1028 if (buffer == NULL)
1029 goto onError;
1030 unicode = PyCodec_Decode(buffer, encoding, errors);
1031 if (unicode == NULL)
1032 goto onError;
1033 if (!PyUnicode_Check(unicode)) {
1034 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001035 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001036 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 Py_DECREF(unicode);
1038 goto onError;
1039 }
1040 Py_DECREF(buffer);
1041 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001042
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 onError:
1044 Py_XDECREF(buffer);
1045 return NULL;
1046}
1047
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001048PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1049 const char *encoding,
1050 const char *errors)
1051{
1052 PyObject *v;
1053
1054 if (!PyUnicode_Check(unicode)) {
1055 PyErr_BadArgument();
1056 goto onError;
1057 }
1058
1059 if (encoding == NULL)
1060 encoding = PyUnicode_GetDefaultEncoding();
1061
1062 /* Decode via the codec registry */
1063 v = PyCodec_Decode(unicode, encoding, errors);
1064 if (v == NULL)
1065 goto onError;
1066 return v;
1067
1068 onError:
1069 return NULL;
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001073 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 const char *encoding,
1075 const char *errors)
1076{
1077 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 unicode = PyUnicode_FromUnicode(s, size);
1080 if (unicode == NULL)
1081 return NULL;
1082 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1083 Py_DECREF(unicode);
1084 return v;
1085}
1086
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001087PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1088 const char *encoding,
1089 const char *errors)
1090{
1091 PyObject *v;
1092
1093 if (!PyUnicode_Check(unicode)) {
1094 PyErr_BadArgument();
1095 goto onError;
1096 }
1097
1098 if (encoding == NULL)
1099 encoding = PyUnicode_GetDefaultEncoding();
1100
1101 /* Encode via the codec registry */
1102 v = PyCodec_Encode(unicode, encoding, errors);
1103 if (v == NULL)
1104 goto onError;
1105 return v;
1106
1107 onError:
1108 return NULL;
1109}
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1112 const char *encoding,
1113 const char *errors)
1114{
1115 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001116
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 if (!PyUnicode_Check(unicode)) {
1118 PyErr_BadArgument();
1119 goto onError;
1120 }
Fred Drakee4315f52000-05-09 19:53:39 +00001121
Tim Petersced69f82003-09-16 20:30:58 +00001122 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001123 encoding = PyUnicode_GetDefaultEncoding();
1124
1125 /* Shortcuts for common default encodings */
1126 if (errors == NULL) {
1127 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001128 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001129 else if (strcmp(encoding, "latin-1") == 0)
1130 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001131#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1132 else if (strcmp(encoding, "mbcs") == 0)
1133 return PyUnicode_AsMBCSString(unicode);
1134#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001135 else if (strcmp(encoding, "ascii") == 0)
1136 return PyUnicode_AsASCIIString(unicode);
1137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139 /* Encode via the codec registry */
1140 v = PyCodec_Encode(unicode, encoding, errors);
1141 if (v == NULL)
1142 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001143 if (!PyBytes_Check(v)) {
1144 if (PyString_Check(v)) {
1145 /* Old codec, turn it into bytes */
1146 PyObject *b = PyBytes_FromObject(v);
1147 Py_DECREF(v);
1148 return b;
1149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001151 "encoder did not return a bytes object "
1152 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1153 v->ob_type->tp_name,
1154 encoding ? encoding : "NULL",
1155 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 Py_DECREF(v);
1157 goto onError;
1158 }
1159 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 onError:
1162 return NULL;
1163}
1164
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001165PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1166 const char *errors)
1167{
1168 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001169 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001170 if (v)
1171 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001172 if (errors != NULL)
1173 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001174 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1175 PyUnicode_GET_SIZE(unicode),
1176 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001177 if (!b)
1178 return NULL;
1179 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1180 PyBytes_Size(b));
1181 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001182 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001183 return v;
1184}
1185
Martin v. Löwis5b222132007-06-10 09:51:05 +00001186char*
1187PyUnicode_AsString(PyObject *unicode)
1188{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001189 if (!PyUnicode_Check(unicode)) {
1190 PyErr_BadArgument();
1191 return NULL;
1192 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00001193 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1194 if (!unicode)
1195 return NULL;
1196 return PyString_AsString(unicode);
1197}
1198
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1200{
1201 if (!PyUnicode_Check(unicode)) {
1202 PyErr_BadArgument();
1203 goto onError;
1204 }
1205 return PyUnicode_AS_UNICODE(unicode);
1206
1207 onError:
1208 return NULL;
1209}
1210
Martin v. Löwis18e16552006-02-15 17:27:45 +00001211Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212{
1213 if (!PyUnicode_Check(unicode)) {
1214 PyErr_BadArgument();
1215 goto onError;
1216 }
1217 return PyUnicode_GET_SIZE(unicode);
1218
1219 onError:
1220 return -1;
1221}
1222
Thomas Wouters78890102000-07-22 19:25:51 +00001223const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001224{
1225 return unicode_default_encoding;
1226}
1227
1228int PyUnicode_SetDefaultEncoding(const char *encoding)
1229{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001230 if (strcmp(encoding, unicode_default_encoding) != 0) {
1231 PyErr_Format(PyExc_ValueError,
1232 "Can only set default encoding to %s",
1233 unicode_default_encoding);
1234 return -1;
1235 }
Fred Drakee4315f52000-05-09 19:53:39 +00001236 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001237}
1238
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001239/* error handling callback helper:
1240 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001241 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001242 and adjust various state variables.
1243 return 0 on success, -1 on error
1244*/
1245
1246static
1247int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1248 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001249 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001250 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001251{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001252 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001253
1254 PyObject *restuple = NULL;
1255 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001256 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001257 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001258 Py_ssize_t requiredsize;
1259 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001261 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001262 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001263 int res = -1;
1264
1265 if (*errorHandler == NULL) {
1266 *errorHandler = PyCodec_LookupError(errors);
1267 if (*errorHandler == NULL)
1268 goto onError;
1269 }
1270
1271 if (*exceptionObject == NULL) {
1272 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001273 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 if (*exceptionObject == NULL)
1275 goto onError;
1276 }
1277 else {
1278 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1279 goto onError;
1280 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1281 goto onError;
1282 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1283 goto onError;
1284 }
1285
1286 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1287 if (restuple == NULL)
1288 goto onError;
1289 if (!PyTuple_Check(restuple)) {
1290 PyErr_Format(PyExc_TypeError, &argparse[4]);
1291 goto onError;
1292 }
1293 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1294 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001295
1296 /* Copy back the bytes variables, which might have been modified by the
1297 callback */
1298 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1299 if (!inputobj)
1300 goto onError;
1301 if (!PyBytes_Check(inputobj)) {
1302 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1303 }
1304 *input = PyBytes_AS_STRING(inputobj);
1305 insize = PyBytes_GET_SIZE(inputobj);
1306 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001307 /* we can DECREF safely, as the exception has another reference,
1308 so the object won't go away. */
1309 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001311 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001312 newpos = insize+newpos;
1313 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001314 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001315 goto onError;
1316 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317
1318 /* need more space? (at least enough for what we
1319 have+the replacement+the rest of the string (starting
1320 at the new input position), so we won't have to check space
1321 when there are no errors in the rest of the string) */
1322 repptr = PyUnicode_AS_UNICODE(repunicode);
1323 repsize = PyUnicode_GET_SIZE(repunicode);
1324 requiredsize = *outpos + repsize + insize-newpos;
1325 if (requiredsize > outsize) {
1326 if (requiredsize<2*outsize)
1327 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001328 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329 goto onError;
1330 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1331 }
1332 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001333 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 Py_UNICODE_COPY(*outptr, repptr, repsize);
1335 *outptr += repsize;
1336 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338 /* we made it! */
1339 res = 0;
1340
1341 onError:
1342 Py_XDECREF(restuple);
1343 return res;
1344}
1345
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001346/* --- UTF-7 Codec -------------------------------------------------------- */
1347
1348/* see RFC2152 for details */
1349
/* Read-only classification table for the 128 ASCII code points, used by
   the UTF-7 codec (see the SPECIAL() macro). */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1368
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c cannot be written directly in UTF-7.  encodeO and
   encodeWS select whether the optional Set O / whitespace classes are
   treated as special as well. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 alphabet character.  The range check comes first
   because isalnum() is only defined for values representable as unsigned
   char (or EOF); c may be a wider character value. */
#define B64CHAR(c) \
    ((unsigned long)(c) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* Inverse of B64(); only meaningful when B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to *out for every complete group of 6 bits held
   in ch, most significant group first; bits is reduced accordingly. */
#define ENCODE(out, ch, bits)                   \
    do {                                        \
        while ((bits) >= 6) {                   \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6;                        \
        }                                       \
    } while (0)

/* Emit one Py_UNICODE to *out for every complete group of 16 bits held in
   ch.  Expects an `errmsg` variable and a `utf7Error` label in the
   expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high */   \
                /* surrogate so let's not bother seeing if the low */   \
                /* surrogate is correct or not */                       \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't */  \
                /* represent it in a 16-bit character */                \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001411
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001412PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001414 const char *errors)
1415{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001417 Py_ssize_t startinpos;
1418 Py_ssize_t endinpos;
1419 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420 const char *e;
1421 PyUnicodeObject *unicode;
1422 Py_UNICODE *p;
1423 const char *errmsg = "";
1424 int inShift = 0;
1425 unsigned int bitsleft = 0;
1426 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427 int surrogate = 0;
1428 PyObject *errorHandler = NULL;
1429 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430
1431 unicode = _PyUnicode_New(size);
1432 if (!unicode)
1433 return NULL;
1434 if (size == 0)
1435 return (PyObject *)unicode;
1436
1437 p = unicode->str;
1438 e = s + size;
1439
1440 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001441 Py_UNICODE ch;
1442 restart:
1443 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001444
1445 if (inShift) {
1446 if ((ch == '-') || !B64CHAR(ch)) {
1447 inShift = 0;
1448 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001449
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001450 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1451 if (bitsleft >= 6) {
1452 /* The shift sequence has a partial character in it. If
1453 bitsleft < 6 then we could just classify it as padding
1454 but that is not the case here */
1455
1456 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001457 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001458 }
1459 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001460 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001461 here so indicate the potential of a misencoded character. */
1462
1463 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1464 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1465 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001466 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467 }
1468
1469 if (ch == '-') {
1470 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001471 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001472 inShift = 1;
1473 }
1474 } else if (SPECIAL(ch,0,0)) {
1475 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001476 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477 } else {
1478 *p++ = ch;
1479 }
1480 } else {
1481 charsleft = (charsleft << 6) | UB64(ch);
1482 bitsleft += 6;
1483 s++;
1484 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1485 }
1486 }
1487 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001488 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001489 s++;
1490 if (s < e && *s == '-') {
1491 s++;
1492 *p++ = '+';
1493 } else
1494 {
1495 inShift = 1;
1496 bitsleft = 0;
1497 }
1498 }
1499 else if (SPECIAL(ch,0,0)) {
1500 errmsg = "unexpected special character";
1501 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001502 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001503 }
1504 else {
1505 *p++ = ch;
1506 s++;
1507 }
1508 continue;
1509 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510 outpos = p-PyUnicode_AS_UNICODE(unicode);
1511 endinpos = s-starts;
1512 if (unicode_decode_call_errorhandler(
1513 errors, &errorHandler,
1514 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001515 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516 (PyObject **)&unicode, &outpos, &p))
1517 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001518 }
1519
1520 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001521 outpos = p-PyUnicode_AS_UNICODE(unicode);
1522 endinpos = size;
1523 if (unicode_decode_call_errorhandler(
1524 errors, &errorHandler,
1525 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001526 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 if (s < e)
1530 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001531 }
1532
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001533 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534 goto onError;
1535
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 Py_XDECREF(errorHandler);
1537 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 return (PyObject *)unicode;
1539
1540onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 Py_XDECREF(errorHandler);
1542 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 Py_DECREF(unicode);
1544 return NULL;
1545}
1546
1547
1548PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001549 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 int encodeSetO,
1551 int encodeWhiteSpace,
1552 const char *errors)
1553{
1554 PyObject *v;
1555 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001556 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001558 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559 unsigned int bitsleft = 0;
1560 unsigned long charsleft = 0;
1561 char * out;
1562 char * start;
1563
1564 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001565 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001566
Walter Dörwald51ab4142007-05-05 14:43:36 +00001567 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 if (v == NULL)
1569 return NULL;
1570
Walter Dörwald51ab4142007-05-05 14:43:36 +00001571 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 for (;i < size; ++i) {
1573 Py_UNICODE ch = s[i];
1574
1575 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001576 if (ch == '+') {
1577 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 *out++ = '-';
1579 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1580 charsleft = ch;
1581 bitsleft = 16;
1582 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001583 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001585 } else {
1586 *out++ = (char) ch;
1587 }
1588 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1590 *out++ = B64(charsleft << (6-bitsleft));
1591 charsleft = 0;
1592 bitsleft = 0;
1593 /* Characters not in the BASE64 set implicitly unshift the sequence
1594 so no '-' is required, except if the character is itself a '-' */
1595 if (B64CHAR(ch) || ch == '-') {
1596 *out++ = '-';
1597 }
1598 inShift = 0;
1599 *out++ = (char) ch;
1600 } else {
1601 bitsleft += 16;
1602 charsleft = (charsleft << 16) | ch;
1603 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1604
1605 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001606 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607 or '-' then the shift sequence will be terminated implicitly and we
1608 don't have to insert a '-'. */
1609
1610 if (bitsleft == 0) {
1611 if (i + 1 < size) {
1612 Py_UNICODE ch2 = s[i+1];
1613
1614 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001615
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001616 } else if (B64CHAR(ch2) || ch2 == '-') {
1617 *out++ = '-';
1618 inShift = 0;
1619 } else {
1620 inShift = 0;
1621 }
1622
1623 }
1624 else {
1625 *out++ = '-';
1626 inShift = 0;
1627 }
1628 }
Tim Petersced69f82003-09-16 20:30:58 +00001629 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001631 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001632 if (bitsleft) {
1633 *out++= B64(charsleft << (6-bitsleft) );
1634 *out++ = '-';
1635 }
1636
Walter Dörwald51ab4142007-05-05 14:43:36 +00001637 if (PyBytes_Resize(v, out - start)) {
1638 Py_DECREF(v);
1639 return NULL;
1640 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 return v;
1642}
1643
1644#undef SPECIAL
1645#undef B64
1646#undef B64CHAR
1647#undef UB64
1648#undef ENCODE
1649#undef DECODE
1650
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651/* --- UTF-8 Codec -------------------------------------------------------- */
1652
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix
   (continuation bytes 0x80..0xBF and 0xFE/0xFF).  Lengths 5 and 6 follow
   the original RFC 2279 layout.  The table is read-only, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1674
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001676 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001677 const char *errors)
1678{
Walter Dörwald69652032004-09-07 20:24:22 +00001679 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1680}
1681
1682PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001683 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001684 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001685 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001686{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001687 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689 Py_ssize_t startinpos;
1690 Py_ssize_t endinpos;
1691 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 const char *e;
1693 PyUnicodeObject *unicode;
1694 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001695 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001696 PyObject *errorHandler = NULL;
1697 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698
1699 /* Note: size will always be longer than the resulting Unicode
1700 character count */
1701 unicode = _PyUnicode_New(size);
1702 if (!unicode)
1703 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001704 if (size == 0) {
1705 if (consumed)
1706 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709
1710 /* Unpack UTF-8 encoded data */
1711 p = unicode->str;
1712 e = s + size;
1713
1714 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001715 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716
1717 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001718 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719 s++;
1720 continue;
1721 }
1722
1723 n = utf8_code_length[ch];
1724
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001725 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001726 if (consumed)
1727 break;
1728 else {
1729 errmsg = "unexpected end of data";
1730 startinpos = s-starts;
1731 endinpos = size;
1732 goto utf8Error;
1733 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735
1736 switch (n) {
1737
1738 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001739 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001740 startinpos = s-starts;
1741 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001742 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743
1744 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001745 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 startinpos = s-starts;
1747 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001748 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749
1750 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001751 if ((s[1] & 0xc0) != 0x80) {
1752 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 startinpos = s-starts;
1754 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001755 goto utf8Error;
1756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001758 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 startinpos = s-starts;
1760 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001761 errmsg = "illegal encoding";
1762 goto utf8Error;
1763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 break;
1767
1768 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001769 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001770 (s[2] & 0xc0) != 0x80) {
1771 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001772 startinpos = s-starts;
1773 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001774 goto utf8Error;
1775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001777 if (ch < 0x0800) {
1778 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001779 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001780
1781 XXX For wide builds (UCS-4) we should probably try
1782 to recombine the surrogates into a single code
1783 unit.
1784 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 startinpos = s-starts;
1787 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001788 goto utf8Error;
1789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001791 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001792 break;
1793
1794 case 4:
1795 if ((s[1] & 0xc0) != 0x80 ||
1796 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001797 (s[3] & 0xc0) != 0x80) {
1798 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 startinpos = s-starts;
1800 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 goto utf8Error;
1802 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001803 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1804 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1805 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001807 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001808 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001809 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001810 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001811 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001812 startinpos = s-starts;
1813 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001814 goto utf8Error;
1815 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001816#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001817 *p++ = (Py_UNICODE)ch;
1818#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001819 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001820
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001821 /* translate from 10000..10FFFF to 0..FFFF */
1822 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001823
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001824 /* high surrogate = top 10 bits added to D800 */
1825 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001826
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001827 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001828 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001829#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 break;
1831
1832 default:
1833 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001834 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 startinpos = s-starts;
1836 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001837 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 }
1839 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001841
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001842 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001843 outpos = p-PyUnicode_AS_UNICODE(unicode);
1844 if (unicode_decode_call_errorhandler(
1845 errors, &errorHandler,
1846 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001847 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 (PyObject **)&unicode, &outpos, &p))
1849 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 }
Walter Dörwald69652032004-09-07 20:24:22 +00001851 if (consumed)
1852 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853
1854 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001855 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 goto onError;
1857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 return (PyObject *)unicode;
1861
1862onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 Py_XDECREF(errorHandler);
1864 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 Py_DECREF(unicode);
1866 return NULL;
1867}
1868
Tim Peters602f7402002-04-27 18:03:26 +00001869/* Allocation strategy: if the string is short, convert into a stack buffer
1870 and allocate exactly as much space needed at the end. Else allocate the
1871 maximum possible needed (4 result bytes per Unicode character), and return
1872 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001873*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001874PyObject *
1875PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001876 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001877 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878{
Tim Peters602f7402002-04-27 18:03:26 +00001879#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001880
Martin v. Löwis18e16552006-02-15 17:27:45 +00001881 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001882 PyObject *v; /* result string object */
1883 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001884 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001885 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001886 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001887
Tim Peters602f7402002-04-27 18:03:26 +00001888 assert(s != NULL);
1889 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890
Tim Peters602f7402002-04-27 18:03:26 +00001891 if (size <= MAX_SHORT_UNICHARS) {
1892 /* Write into the stack buffer; nallocated can't overflow.
1893 * At the end, we'll allocate exactly as much heap space as it
1894 * turns out we need.
1895 */
1896 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1897 v = NULL; /* will allocate after we're done */
1898 p = stackbuf;
1899 }
1900 else {
1901 /* Overallocate on the heap, and give the excess back at the end. */
1902 nallocated = size * 4;
1903 if (nallocated / 4 != size) /* overflow! */
1904 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001905 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001906 if (v == NULL)
1907 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001908 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001909 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001910
Tim Peters602f7402002-04-27 18:03:26 +00001911 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001912 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001913
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001914 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001915 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001916 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001917
Guido van Rossumd57fd912000-03-10 22:53:23 +00001918 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001919 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001920 *p++ = (char)(0xc0 | (ch >> 6));
1921 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001922 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001923 else {
Tim Peters602f7402002-04-27 18:03:26 +00001924 /* Encode UCS2 Unicode ordinals */
1925 if (ch < 0x10000) {
1926 /* Special case: check for high surrogate */
1927 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1928 Py_UCS4 ch2 = s[i];
1929 /* Check for low surrogate and combine the two to
1930 form a UCS4 value */
1931 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001932 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001933 i++;
1934 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001935 }
Tim Peters602f7402002-04-27 18:03:26 +00001936 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001937 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001938 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001939 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1940 *p++ = (char)(0x80 | (ch & 0x3f));
1941 continue;
1942 }
1943encodeUCS4:
1944 /* Encode UCS4 Unicode ordinals */
1945 *p++ = (char)(0xf0 | (ch >> 18));
1946 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1947 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1948 *p++ = (char)(0x80 | (ch & 0x3f));
1949 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001951
Tim Peters602f7402002-04-27 18:03:26 +00001952 if (v == NULL) {
1953 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001954 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001955 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001956 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001957 }
1958 else {
1959 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001960 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001961 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001962 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001965
Tim Peters602f7402002-04-27 18:03:26 +00001966#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967}
1968
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1970{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 if (!PyUnicode_Check(unicode)) {
1972 PyErr_BadArgument();
1973 return NULL;
1974 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001975 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1976 PyUnicode_GET_SIZE(unicode),
1977 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978}
1979
Walter Dörwald41980ca2007-08-16 21:55:45 +00001980/* --- UTF-32 Codec ------------------------------------------------------- */
1981
1982PyObject *
1983PyUnicode_DecodeUTF32(const char *s,
1984 Py_ssize_t size,
1985 const char *errors,
1986 int *byteorder)
1987{
1988 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1989}
1990
1991PyObject *
1992PyUnicode_DecodeUTF32Stateful(const char *s,
1993 Py_ssize_t size,
1994 const char *errors,
1995 int *byteorder,
1996 Py_ssize_t *consumed)
1997{
1998 const char *starts = s;
1999 Py_ssize_t startinpos;
2000 Py_ssize_t endinpos;
2001 Py_ssize_t outpos;
2002 PyUnicodeObject *unicode;
2003 Py_UNICODE *p;
2004#ifndef Py_UNICODE_WIDE
2005 int i, pairs;
2006#else
2007 const int pairs = 0;
2008#endif
2009 const unsigned char *q, *e;
2010 int bo = 0; /* assume native ordering by default */
2011 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002012 /* Offsets from q for retrieving bytes in the right order. */
2013#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2014 int iorder[] = {0, 1, 2, 3};
2015#else
2016 int iorder[] = {3, 2, 1, 0};
2017#endif
2018 PyObject *errorHandler = NULL;
2019 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002020 /* On narrow builds we split characters outside the BMP into two
2021 codepoints => count how much extra space we need. */
2022#ifndef Py_UNICODE_WIDE
2023 for (i = pairs = 0; i < size/4; i++)
2024 if (((Py_UCS4 *)s)[i] >= 0x10000)
2025 pairs++;
2026#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002027
2028 /* This might be one to much, because of a BOM */
2029 unicode = _PyUnicode_New((size+3)/4+pairs);
2030 if (!unicode)
2031 return NULL;
2032 if (size == 0)
2033 return (PyObject *)unicode;
2034
2035 /* Unpack UTF-32 encoded data */
2036 p = unicode->str;
2037 q = (unsigned char *)s;
2038 e = q + size;
2039
2040 if (byteorder)
2041 bo = *byteorder;
2042
2043 /* Check for BOM marks (U+FEFF) in the input and adjust current
2044 byte order setting accordingly. In native mode, the leading BOM
2045 mark is skipped, in all other modes, it is copied to the output
2046 stream as-is (giving a ZWNBSP character). */
2047 if (bo == 0) {
2048 if (size >= 4) {
2049 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2050 (q[iorder[1]] << 8) | q[iorder[0]];
2051#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2052 if (bom == 0x0000FEFF) {
2053 q += 4;
2054 bo = -1;
2055 }
2056 else if (bom == 0xFFFE0000) {
2057 q += 4;
2058 bo = 1;
2059 }
2060#else
2061 if (bom == 0x0000FEFF) {
2062 q += 4;
2063 bo = 1;
2064 }
2065 else if (bom == 0xFFFE0000) {
2066 q += 4;
2067 bo = -1;
2068 }
2069#endif
2070 }
2071 }
2072
2073 if (bo == -1) {
2074 /* force LE */
2075 iorder[0] = 0;
2076 iorder[1] = 1;
2077 iorder[2] = 2;
2078 iorder[3] = 3;
2079 }
2080 else if (bo == 1) {
2081 /* force BE */
2082 iorder[0] = 3;
2083 iorder[1] = 2;
2084 iorder[2] = 1;
2085 iorder[3] = 0;
2086 }
2087
2088 while (q < e) {
2089 Py_UCS4 ch;
2090 /* remaining bytes at the end? (size should be divisible by 4) */
2091 if (e-q<4) {
2092 if (consumed)
2093 break;
2094 errmsg = "truncated data";
2095 startinpos = ((const char *)q)-starts;
2096 endinpos = ((const char *)e)-starts;
2097 goto utf32Error;
2098 /* The remaining input chars are ignored if the callback
2099 chooses to skip the input */
2100 }
2101 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2102 (q[iorder[1]] << 8) | q[iorder[0]];
2103
2104 if (ch >= 0x110000)
2105 {
2106 errmsg = "codepoint not in range(0x110000)";
2107 startinpos = ((const char *)q)-starts;
2108 endinpos = startinpos+4;
2109 goto utf32Error;
2110 }
2111#ifndef Py_UNICODE_WIDE
2112 if (ch >= 0x10000)
2113 {
2114 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2115 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2116 }
2117 else
2118#endif
2119 *p++ = ch;
2120 q += 4;
2121 continue;
2122 utf32Error:
2123 outpos = p-PyUnicode_AS_UNICODE(unicode);
2124 if (unicode_decode_call_errorhandler(
2125 errors, &errorHandler,
2126 "utf32", errmsg,
2127 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2128 (PyObject **)&unicode, &outpos, &p))
2129 goto onError;
2130 }
2131
2132 if (byteorder)
2133 *byteorder = bo;
2134
2135 if (consumed)
2136 *consumed = (const char *)q-starts;
2137
2138 /* Adjust length */
2139 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2140 goto onError;
2141
2142 Py_XDECREF(errorHandler);
2143 Py_XDECREF(exc);
2144 return (PyObject *)unicode;
2145
2146onError:
2147 Py_DECREF(unicode);
2148 Py_XDECREF(errorHandler);
2149 Py_XDECREF(exc);
2150 return NULL;
2151}
2152
2153PyObject *
2154PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2155 Py_ssize_t size,
2156 const char *errors,
2157 int byteorder)
2158{
2159 PyObject *v;
2160 unsigned char *p;
2161#ifndef Py_UNICODE_WIDE
2162 int i, pairs;
2163#else
2164 const int pairs = 0;
2165#endif
2166 /* Offsets from p for storing byte pairs in the right order. */
2167#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2168 int iorder[] = {0, 1, 2, 3};
2169#else
2170 int iorder[] = {3, 2, 1, 0};
2171#endif
2172
2173#define STORECHAR(CH) \
2174 do { \
2175 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2176 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2177 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2178 p[iorder[0]] = (CH) & 0xff; \
2179 p += 4; \
2180 } while(0)
2181
2182 /* In narrow builds we can output surrogate pairs as one codepoint,
2183 so we need less space. */
2184#ifndef Py_UNICODE_WIDE
2185 for (i = pairs = 0; i < size-1; i++)
2186 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2187 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2188 pairs++;
2189#endif
2190 v = PyBytes_FromStringAndSize(NULL,
2191 4 * (size - pairs + (byteorder == 0)));
2192 if (v == NULL)
2193 return NULL;
2194
2195 p = (unsigned char *)PyBytes_AS_STRING(v);
2196 if (byteorder == 0)
2197 STORECHAR(0xFEFF);
2198 if (size == 0)
2199 return v;
2200
2201 if (byteorder == -1) {
2202 /* force LE */
2203 iorder[0] = 0;
2204 iorder[1] = 1;
2205 iorder[2] = 2;
2206 iorder[3] = 3;
2207 }
2208 else if (byteorder == 1) {
2209 /* force BE */
2210 iorder[0] = 3;
2211 iorder[1] = 2;
2212 iorder[2] = 1;
2213 iorder[3] = 0;
2214 }
2215
2216 while (size-- > 0) {
2217 Py_UCS4 ch = *s++;
2218#ifndef Py_UNICODE_WIDE
2219 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2220 Py_UCS4 ch2 = *s;
2221 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2222 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2223 s++;
2224 size--;
2225 }
2226 }
2227#endif
2228 STORECHAR(ch);
2229 }
2230 return v;
2231#undef STORECHAR
2232}
2233
2234PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2235{
2236 if (!PyUnicode_Check(unicode)) {
2237 PyErr_BadArgument();
2238 return NULL;
2239 }
2240 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2241 PyUnicode_GET_SIZE(unicode),
2242 NULL,
2243 0);
2244}
2245
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246/* --- UTF-16 Codec ------------------------------------------------------- */
2247
Tim Peters772747b2001-08-09 22:21:55 +00002248PyObject *
2249PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002250 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002251 const char *errors,
2252 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002253{
Walter Dörwald69652032004-09-07 20:24:22 +00002254 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2255}
2256
2257PyObject *
2258PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002259 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002260 const char *errors,
2261 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002262 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002263{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002264 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002265 Py_ssize_t startinpos;
2266 Py_ssize_t endinpos;
2267 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 PyUnicodeObject *unicode;
2269 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002270 const unsigned char *q, *e;
2271 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002272 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002273 /* Offsets from q for retrieving byte pairs in the right order. */
2274#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2275 int ihi = 1, ilo = 0;
2276#else
2277 int ihi = 0, ilo = 1;
2278#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002279 PyObject *errorHandler = NULL;
2280 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281
2282 /* Note: size will always be longer than the resulting Unicode
2283 character count */
2284 unicode = _PyUnicode_New(size);
2285 if (!unicode)
2286 return NULL;
2287 if (size == 0)
2288 return (PyObject *)unicode;
2289
2290 /* Unpack UTF-16 encoded data */
2291 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002292 q = (unsigned char *)s;
2293 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294
2295 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002296 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002298 /* Check for BOM marks (U+FEFF) in the input and adjust current
2299 byte order setting accordingly. In native mode, the leading BOM
2300 mark is skipped, in all other modes, it is copied to the output
2301 stream as-is (giving a ZWNBSP character). */
2302 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002303 if (size >= 2) {
2304 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002305#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002306 if (bom == 0xFEFF) {
2307 q += 2;
2308 bo = -1;
2309 }
2310 else if (bom == 0xFFFE) {
2311 q += 2;
2312 bo = 1;
2313 }
Tim Petersced69f82003-09-16 20:30:58 +00002314#else
Walter Dörwald69652032004-09-07 20:24:22 +00002315 if (bom == 0xFEFF) {
2316 q += 2;
2317 bo = 1;
2318 }
2319 else if (bom == 0xFFFE) {
2320 q += 2;
2321 bo = -1;
2322 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002323#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002324 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326
Tim Peters772747b2001-08-09 22:21:55 +00002327 if (bo == -1) {
2328 /* force LE */
2329 ihi = 1;
2330 ilo = 0;
2331 }
2332 else if (bo == 1) {
2333 /* force BE */
2334 ihi = 0;
2335 ilo = 1;
2336 }
2337
2338 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002339 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002340 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002342 if (consumed)
2343 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002344 errmsg = "truncated data";
2345 startinpos = ((const char *)q)-starts;
2346 endinpos = ((const char *)e)-starts;
2347 goto utf16Error;
2348 /* The remaining input chars are ignored if the callback
2349 chooses to skip the input */
2350 }
2351 ch = (q[ihi] << 8) | q[ilo];
2352
Tim Peters772747b2001-08-09 22:21:55 +00002353 q += 2;
2354
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 if (ch < 0xD800 || ch > 0xDFFF) {
2356 *p++ = ch;
2357 continue;
2358 }
2359
2360 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002361 if (q >= e) {
2362 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002363 startinpos = (((const char *)q)-2)-starts;
2364 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002365 goto utf16Error;
2366 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002367 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002368 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2369 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002370 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002371#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002372 *p++ = ch;
2373 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002374#else
2375 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002376#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002377 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002378 }
2379 else {
2380 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002381 startinpos = (((const char *)q)-4)-starts;
2382 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002383 goto utf16Error;
2384 }
2385
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002387 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002388 startinpos = (((const char *)q)-2)-starts;
2389 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002390 /* Fall through to report the error */
2391
2392 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393 outpos = p-PyUnicode_AS_UNICODE(unicode);
2394 if (unicode_decode_call_errorhandler(
2395 errors, &errorHandler,
2396 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002397 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002399 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 }
2401
2402 if (byteorder)
2403 *byteorder = bo;
2404
Walter Dörwald69652032004-09-07 20:24:22 +00002405 if (consumed)
2406 *consumed = (const char *)q-starts;
2407
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002409 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 goto onError;
2411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002412 Py_XDECREF(errorHandler);
2413 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414 return (PyObject *)unicode;
2415
2416onError:
2417 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002418 Py_XDECREF(errorHandler);
2419 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420 return NULL;
2421}
2422
Tim Peters772747b2001-08-09 22:21:55 +00002423PyObject *
2424PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002425 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002426 const char *errors,
2427 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428{
2429 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002430 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002431#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002432 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002433#else
2434 const int pairs = 0;
2435#endif
Tim Peters772747b2001-08-09 22:21:55 +00002436 /* Offsets from p for storing byte pairs in the right order. */
2437#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2438 int ihi = 1, ilo = 0;
2439#else
2440 int ihi = 0, ilo = 1;
2441#endif
2442
2443#define STORECHAR(CH) \
2444 do { \
2445 p[ihi] = ((CH) >> 8) & 0xff; \
2446 p[ilo] = (CH) & 0xff; \
2447 p += 2; \
2448 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002450#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002451 for (i = pairs = 0; i < size; i++)
2452 if (s[i] >= 0x10000)
2453 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002454#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002455 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002456 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457 if (v == NULL)
2458 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459
Walter Dörwald3cc34522007-05-04 10:48:27 +00002460 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002462 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002463 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002464 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002465
2466 if (byteorder == -1) {
2467 /* force LE */
2468 ihi = 1;
2469 ilo = 0;
2470 }
2471 else if (byteorder == 1) {
2472 /* force BE */
2473 ihi = 0;
2474 ilo = 1;
2475 }
2476
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002477 while (size-- > 0) {
2478 Py_UNICODE ch = *s++;
2479 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002480#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002481 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002482 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2483 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002485#endif
Tim Peters772747b2001-08-09 22:21:55 +00002486 STORECHAR(ch);
2487 if (ch2)
2488 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002491#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492}
2493
2494PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2495{
2496 if (!PyUnicode_Check(unicode)) {
2497 PyErr_BadArgument();
2498 return NULL;
2499 }
2500 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2501 PyUnicode_GET_SIZE(unicode),
2502 NULL,
2503 0);
2504}
2505
2506/* --- Unicode Escape Codec ----------------------------------------------- */
2507
Fredrik Lundh06d12682001-01-24 07:59:11 +00002508static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002509
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002511 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 const char *errors)
2513{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002514 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002515 Py_ssize_t startinpos;
2516 Py_ssize_t endinpos;
2517 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002518 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002520 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002522 char* message;
2523 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 PyObject *errorHandler = NULL;
2525 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 /* Escaped strings will always be longer than the resulting
2528 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 length after conversion to the true value.
2530 (but if the error callback returns a long replacement string
2531 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 v = _PyUnicode_New(size);
2533 if (v == NULL)
2534 goto onError;
2535 if (size == 0)
2536 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002540
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 while (s < end) {
2542 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002543 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545
2546 /* Non-escape characters are interpreted as Unicode ordinals */
2547 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002548 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 continue;
2550 }
2551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002552 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 /* \ - Escapes */
2554 s++;
2555 switch (*s++) {
2556
2557 /* \x escapes */
2558 case '\n': break;
2559 case '\\': *p++ = '\\'; break;
2560 case '\'': *p++ = '\''; break;
2561 case '\"': *p++ = '\"'; break;
2562 case 'b': *p++ = '\b'; break;
2563 case 'f': *p++ = '\014'; break; /* FF */
2564 case 't': *p++ = '\t'; break;
2565 case 'n': *p++ = '\n'; break;
2566 case 'r': *p++ = '\r'; break;
2567 case 'v': *p++ = '\013'; break; /* VT */
2568 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2569
2570 /* \OOO (octal) escapes */
2571 case '0': case '1': case '2': case '3':
2572 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002573 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002575 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002577 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002579 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 break;
2581
Fredrik Lundhccc74732001-02-18 22:13:49 +00002582 /* hex escapes */
2583 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002585 digits = 2;
2586 message = "truncated \\xXX escape";
2587 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588
Fredrik Lundhccc74732001-02-18 22:13:49 +00002589 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002591 digits = 4;
2592 message = "truncated \\uXXXX escape";
2593 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594
Fredrik Lundhccc74732001-02-18 22:13:49 +00002595 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002596 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002597 digits = 8;
2598 message = "truncated \\UXXXXXXXX escape";
2599 hexescape:
2600 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002601 outpos = p-PyUnicode_AS_UNICODE(v);
2602 if (s+digits>end) {
2603 endinpos = size;
2604 if (unicode_decode_call_errorhandler(
2605 errors, &errorHandler,
2606 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002607 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 (PyObject **)&v, &outpos, &p))
2609 goto onError;
2610 goto nextByte;
2611 }
2612 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002613 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002614 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 endinpos = (s+i+1)-starts;
2616 if (unicode_decode_call_errorhandler(
2617 errors, &errorHandler,
2618 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002619 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002620 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002621 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002622 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002623 }
2624 chr = (chr<<4) & ~0xF;
2625 if (c >= '0' && c <= '9')
2626 chr += c - '0';
2627 else if (c >= 'a' && c <= 'f')
2628 chr += 10 + c - 'a';
2629 else
2630 chr += 10 + c - 'A';
2631 }
2632 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002633 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 /* _decoding_error will have already written into the
2635 target buffer. */
2636 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002637 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002638 /* when we get here, chr is a 32-bit unicode character */
2639 if (chr <= 0xffff)
2640 /* UCS-2 character */
2641 *p++ = (Py_UNICODE) chr;
2642 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002643 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002644 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002645#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002646 *p++ = chr;
2647#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002648 chr -= 0x10000L;
2649 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002650 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002651#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002652 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002653 endinpos = s-starts;
2654 outpos = p-PyUnicode_AS_UNICODE(v);
2655 if (unicode_decode_call_errorhandler(
2656 errors, &errorHandler,
2657 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002658 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002659 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002660 goto onError;
2661 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002662 break;
2663
2664 /* \N{name} */
2665 case 'N':
2666 message = "malformed \\N character escape";
2667 if (ucnhash_CAPI == NULL) {
2668 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002669 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002670 m = PyImport_ImportModule("unicodedata");
2671 if (m == NULL)
2672 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002673 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002674 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002675 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002676 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002677 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002678 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002679 if (ucnhash_CAPI == NULL)
2680 goto ucnhashError;
2681 }
2682 if (*s == '{') {
2683 const char *start = s+1;
2684 /* look for the closing brace */
2685 while (*s != '}' && s < end)
2686 s++;
2687 if (s > start && s < end && *s == '}') {
2688 /* found a name. look it up in the unicode database */
2689 message = "unknown Unicode character name";
2690 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002691 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002692 goto store;
2693 }
2694 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002695 endinpos = s-starts;
2696 outpos = p-PyUnicode_AS_UNICODE(v);
2697 if (unicode_decode_call_errorhandler(
2698 errors, &errorHandler,
2699 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002700 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002702 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002703 break;
2704
2705 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002706 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 message = "\\ at end of string";
2708 s--;
2709 endinpos = s-starts;
2710 outpos = p-PyUnicode_AS_UNICODE(v);
2711 if (unicode_decode_call_errorhandler(
2712 errors, &errorHandler,
2713 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002714 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002716 goto onError;
2717 }
2718 else {
2719 *p++ = '\\';
2720 *p++ = (unsigned char)s[-1];
2721 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002722 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 nextByte:
2725 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002727 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002732
Fredrik Lundhccc74732001-02-18 22:13:49 +00002733ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002734 PyErr_SetString(
2735 PyExc_UnicodeError,
2736 "\\N escapes not supported (can't load unicodedata module)"
2737 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002738 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 Py_XDECREF(errorHandler);
2740 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002741 return NULL;
2742
Fredrik Lundhccc74732001-02-18 22:13:49 +00002743onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 Py_XDECREF(errorHandler);
2746 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 return NULL;
2748}
2749
2750/* Return a Unicode-Escape string version of the Unicode object.
2751
2752 If quotes is true, the string is enclosed in u"" or u'' quotes as
2753 appropriate.
2754
2755*/
2756
Thomas Wouters477c8d52006-05-27 19:21:47 +00002757Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2758 Py_ssize_t size,
2759 Py_UNICODE ch)
2760{
2761 /* like wcschr, but doesn't stop at NULL characters */
2762
2763 while (size-- > 0) {
2764 if (*s == ch)
2765 return s;
2766 s++;
2767 }
2768
2769 return NULL;
2770}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002771
Walter Dörwald79e913e2007-05-12 11:08:06 +00002772static const char *hexdigits = "0123456789abcdef";
2773
2774PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2775 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776{
2777 PyObject *repr;
2778 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779
Thomas Wouters89f507f2006-12-13 04:49:30 +00002780 /* XXX(nnorwitz): rather than over-allocating, it would be
2781 better to choose a different scheme. Perhaps scan the
2782 first N-chars of the string and allocate based on that size.
2783 */
2784 /* Initial allocation is based on the longest-possible unichr
2785 escape.
2786
2787 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2788 unichr, so in this case it's the longest unichr escape. In
2789 narrow (UTF-16) builds this is five chars per source unichr
2790 since there are two unichrs in the surrogate pair, so in narrow
2791 (UTF-16) builds it's not the longest unichr escape.
2792
2793 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2794 so in the narrow (UTF-16) build case it's the longest unichr
2795 escape.
2796 */
2797
Walter Dörwald79e913e2007-05-12 11:08:06 +00002798 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002799#ifdef Py_UNICODE_WIDE
2800 + 10*size
2801#else
2802 + 6*size
2803#endif
2804 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 if (repr == NULL)
2806 return NULL;
2807
Walter Dörwald79e913e2007-05-12 11:08:06 +00002808 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 while (size-- > 0) {
2811 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002812
Walter Dörwald79e913e2007-05-12 11:08:06 +00002813 /* Escape backslashes */
2814 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 *p++ = '\\';
2816 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002817 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002818 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002819
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002820#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002821 /* Map 21-bit characters to '\U00xxxxxx' */
2822 else if (ch >= 0x10000) {
2823 *p++ = '\\';
2824 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002825 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2826 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2827 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2828 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2829 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2830 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2831 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2832 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002833 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002834 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002835#else
2836 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002837 else if (ch >= 0xD800 && ch < 0xDC00) {
2838 Py_UNICODE ch2;
2839 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002840
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002841 ch2 = *s++;
2842 size--;
2843 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2844 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2845 *p++ = '\\';
2846 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002847 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2848 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2849 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2850 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2851 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2852 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2853 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2854 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002855 continue;
2856 }
2857 /* Fall through: isolated surrogates are copied as-is */
2858 s--;
2859 size++;
2860 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002861#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002862
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002864 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865 *p++ = '\\';
2866 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002867 *p++ = hexdigits[(ch >> 12) & 0x000F];
2868 *p++ = hexdigits[(ch >> 8) & 0x000F];
2869 *p++ = hexdigits[(ch >> 4) & 0x000F];
2870 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002872
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002873 /* Map special whitespace to '\t', \n', '\r' */
2874 else if (ch == '\t') {
2875 *p++ = '\\';
2876 *p++ = 't';
2877 }
2878 else if (ch == '\n') {
2879 *p++ = '\\';
2880 *p++ = 'n';
2881 }
2882 else if (ch == '\r') {
2883 *p++ = '\\';
2884 *p++ = 'r';
2885 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002886
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002887 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002888 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002890 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002891 *p++ = hexdigits[(ch >> 4) & 0x000F];
2892 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002893 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002894
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 /* Copy everything else as-is */
2896 else
2897 *p++ = (char) ch;
2898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899
2900 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002901 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2902 Py_DECREF(repr);
2903 return NULL;
2904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 return repr;
2906}
2907
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2909{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002910 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 if (!PyUnicode_Check(unicode)) {
2912 PyErr_BadArgument();
2913 return NULL;
2914 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002915 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2916 PyUnicode_GET_SIZE(unicode));
2917
2918 if (!s)
2919 return NULL;
2920 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2921 PyBytes_GET_SIZE(s));
2922 Py_DECREF(s);
2923 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924}
2925
2926/* --- Raw Unicode Escape Codec ------------------------------------------- */
2927
2928PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002929 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 const char *errors)
2931{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002933 Py_ssize_t startinpos;
2934 Py_ssize_t endinpos;
2935 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 const char *end;
2939 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 PyObject *errorHandler = NULL;
2941 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002942
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 /* Escaped strings will always be longer than the resulting
2944 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945 length after conversion to the true value. (But decoding error
2946 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 v = _PyUnicode_New(size);
2948 if (v == NULL)
2949 goto onError;
2950 if (size == 0)
2951 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 end = s + size;
2954 while (s < end) {
2955 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002956 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002958 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959
2960 /* Non-escape characters are interpreted as Unicode ordinals */
2961 if (*s != '\\') {
2962 *p++ = (unsigned char)*s++;
2963 continue;
2964 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
2967 /* \u-escapes are only interpreted iff the number of leading
2968 backslashes if odd */
2969 bs = s;
2970 for (;s < end;) {
2971 if (*s != '\\')
2972 break;
2973 *p++ = (unsigned char)*s++;
2974 }
2975 if (((s - bs) & 1) == 0 ||
2976 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002977 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 continue;
2979 }
2980 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002981 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 s++;
2983
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002984 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002986 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002987 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002989 endinpos = s-starts;
2990 if (unicode_decode_call_errorhandler(
2991 errors, &errorHandler,
2992 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002993 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 }
2998 x = (x<<4) & ~0xF;
2999 if (c >= '0' && c <= '9')
3000 x += c - '0';
3001 else if (c >= 'a' && c <= 'f')
3002 x += 10 + c - 'a';
3003 else
3004 x += 10 + c - 'A';
3005 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003006#ifndef Py_UNICODE_WIDE
3007 if (x > 0x10000) {
3008 if (unicode_decode_call_errorhandler(
3009 errors, &errorHandler,
3010 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003011 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003012 (PyObject **)&v, &outpos, &p))
3013 goto onError;
3014 }
3015#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003016 *p++ = x;
3017 nextByte:
3018 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003020 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003021 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 Py_XDECREF(errorHandler);
3023 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003025
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026 onError:
3027 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003028 Py_XDECREF(errorHandler);
3029 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 return NULL;
3031}
3032
3033PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003034 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035{
3036 PyObject *repr;
3037 char *p;
3038 char *q;
3039
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003040#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003041 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003042#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003043 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003044#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 if (repr == NULL)
3046 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003047 if (size == 0)
3048 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049
Walter Dörwald711005d2007-05-12 12:03:26 +00003050 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 while (size-- > 0) {
3052 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003053#ifdef Py_UNICODE_WIDE
3054 /* Map 32-bit characters to '\Uxxxxxxxx' */
3055 if (ch >= 0x10000) {
3056 *p++ = '\\';
3057 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003058 *p++ = hexdigits[(ch >> 28) & 0xf];
3059 *p++ = hexdigits[(ch >> 24) & 0xf];
3060 *p++ = hexdigits[(ch >> 20) & 0xf];
3061 *p++ = hexdigits[(ch >> 16) & 0xf];
3062 *p++ = hexdigits[(ch >> 12) & 0xf];
3063 *p++ = hexdigits[(ch >> 8) & 0xf];
3064 *p++ = hexdigits[(ch >> 4) & 0xf];
3065 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003066 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003067 else
3068#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 /* Map 16-bit characters to '\uxxxx' */
3070 if (ch >= 256) {
3071 *p++ = '\\';
3072 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003073 *p++ = hexdigits[(ch >> 12) & 0xf];
3074 *p++ = hexdigits[(ch >> 8) & 0xf];
3075 *p++ = hexdigits[(ch >> 4) & 0xf];
3076 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 }
3078 /* Copy everything else as-is */
3079 else
3080 *p++ = (char) ch;
3081 }
3082 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003083 if (PyBytes_Resize(repr, p - q)) {
3084 Py_DECREF(repr);
3085 return NULL;
3086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 return repr;
3088}
3089
3090PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3091{
Walter Dörwald711005d2007-05-12 12:03:26 +00003092 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003094 PyErr_BadArgument();
3095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003097 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3098 PyUnicode_GET_SIZE(unicode));
3099
3100 if (!s)
3101 return NULL;
3102 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3103 PyBytes_GET_SIZE(s));
3104 Py_DECREF(s);
3105 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106}
3107
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003108/* --- Unicode Internal Codec ------------------------------------------- */
3109
3110PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003111 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003112 const char *errors)
3113{
3114 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003115 Py_ssize_t startinpos;
3116 Py_ssize_t endinpos;
3117 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003118 PyUnicodeObject *v;
3119 Py_UNICODE *p;
3120 const char *end;
3121 const char *reason;
3122 PyObject *errorHandler = NULL;
3123 PyObject *exc = NULL;
3124
Neal Norwitzd43069c2006-01-08 01:12:10 +00003125#ifdef Py_UNICODE_WIDE
3126 Py_UNICODE unimax = PyUnicode_GetMax();
3127#endif
3128
Thomas Wouters89f507f2006-12-13 04:49:30 +00003129 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003130 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3131 if (v == NULL)
3132 goto onError;
3133 if (PyUnicode_GetSize((PyObject *)v) == 0)
3134 return (PyObject *)v;
3135 p = PyUnicode_AS_UNICODE(v);
3136 end = s + size;
3137
3138 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003139 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003140 /* We have to sanity check the raw data, otherwise doom looms for
3141 some malformed UCS-4 data. */
3142 if (
3143 #ifdef Py_UNICODE_WIDE
3144 *p > unimax || *p < 0 ||
3145 #endif
3146 end-s < Py_UNICODE_SIZE
3147 )
3148 {
3149 startinpos = s - starts;
3150 if (end-s < Py_UNICODE_SIZE) {
3151 endinpos = end-starts;
3152 reason = "truncated input";
3153 }
3154 else {
3155 endinpos = s - starts + Py_UNICODE_SIZE;
3156 reason = "illegal code point (> 0x10FFFF)";
3157 }
3158 outpos = p - PyUnicode_AS_UNICODE(v);
3159 if (unicode_decode_call_errorhandler(
3160 errors, &errorHandler,
3161 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003162 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003163 (PyObject **)&v, &outpos, &p)) {
3164 goto onError;
3165 }
3166 }
3167 else {
3168 p++;
3169 s += Py_UNICODE_SIZE;
3170 }
3171 }
3172
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003173 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003174 goto onError;
3175 Py_XDECREF(errorHandler);
3176 Py_XDECREF(exc);
3177 return (PyObject *)v;
3178
3179 onError:
3180 Py_XDECREF(v);
3181 Py_XDECREF(errorHandler);
3182 Py_XDECREF(exc);
3183 return NULL;
3184}
3185
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186/* --- Latin-1 Codec ------------------------------------------------------ */
3187
3188PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003189 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190 const char *errors)
3191{
3192 PyUnicodeObject *v;
3193 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003194
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003196 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003197 Py_UNICODE r = *(unsigned char*)s;
3198 return PyUnicode_FromUnicode(&r, 1);
3199 }
3200
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 v = _PyUnicode_New(size);
3202 if (v == NULL)
3203 goto onError;
3204 if (size == 0)
3205 return (PyObject *)v;
3206 p = PyUnicode_AS_UNICODE(v);
3207 while (size-- > 0)
3208 *p++ = (unsigned char)*s++;
3209 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003210
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 onError:
3212 Py_XDECREF(v);
3213 return NULL;
3214}
3215
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216/* create or adjust a UnicodeEncodeError */
3217static void make_encode_exception(PyObject **exceptionObject,
3218 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003219 const Py_UNICODE *unicode, Py_ssize_t size,
3220 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 if (*exceptionObject == NULL) {
3224 *exceptionObject = PyUnicodeEncodeError_Create(
3225 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 }
3227 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3229 goto onError;
3230 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3231 goto onError;
3232 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3233 goto onError;
3234 return;
3235 onError:
3236 Py_DECREF(*exceptionObject);
3237 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 }
3239}
3240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241/* raises a UnicodeEncodeError */
3242static void raise_encode_exception(PyObject **exceptionObject,
3243 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003244 const Py_UNICODE *unicode, Py_ssize_t size,
3245 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 const char *reason)
3247{
3248 make_encode_exception(exceptionObject,
3249 encoding, unicode, size, startpos, endpos, reason);
3250 if (*exceptionObject != NULL)
3251 PyCodec_StrictErrors(*exceptionObject);
3252}
3253
3254/* error handling callback helper:
3255 build arguments, call the callback and check the arguments,
3256 put the result into newpos and return the replacement string, which
3257 has to be freed by the caller */
3258static PyObject *unicode_encode_call_errorhandler(const char *errors,
3259 PyObject **errorHandler,
3260 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003261 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3262 Py_ssize_t startpos, Py_ssize_t endpos,
3263 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003265 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266
3267 PyObject *restuple;
3268 PyObject *resunicode;
3269
3270 if (*errorHandler == NULL) {
3271 *errorHandler = PyCodec_LookupError(errors);
3272 if (*errorHandler == NULL)
3273 return NULL;
3274 }
3275
3276 make_encode_exception(exceptionObject,
3277 encoding, unicode, size, startpos, endpos, reason);
3278 if (*exceptionObject == NULL)
3279 return NULL;
3280
3281 restuple = PyObject_CallFunctionObjArgs(
3282 *errorHandler, *exceptionObject, NULL);
3283 if (restuple == NULL)
3284 return NULL;
3285 if (!PyTuple_Check(restuple)) {
3286 PyErr_Format(PyExc_TypeError, &argparse[4]);
3287 Py_DECREF(restuple);
3288 return NULL;
3289 }
3290 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3291 &resunicode, newpos)) {
3292 Py_DECREF(restuple);
3293 return NULL;
3294 }
3295 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003296 *newpos = size+*newpos;
3297 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003298 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003299 Py_DECREF(restuple);
3300 return NULL;
3301 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 Py_INCREF(resunicode);
3303 Py_DECREF(restuple);
3304 return resunicode;
3305}
3306
3307static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003308 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 const char *errors,
3310 int limit)
3311{
3312 /* output object */
3313 PyObject *res;
3314 /* pointers to the beginning and end+1 of input */
3315 const Py_UNICODE *startp = p;
3316 const Py_UNICODE *endp = p + size;
3317 /* pointer to the beginning of the unencodable characters */
3318 /* const Py_UNICODE *badp = NULL; */
3319 /* pointer into the output */
3320 char *str;
3321 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003322 Py_ssize_t respos = 0;
3323 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003324 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3325 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 PyObject *errorHandler = NULL;
3327 PyObject *exc = NULL;
3328 /* the following variable is used for caching string comparisons
3329 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3330 int known_errorHandler = -1;
3331
3332 /* allocate enough for a simple encoding without
3333 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003334 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 if (res == NULL)
3336 goto onError;
3337 if (size == 0)
3338 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003339 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340 ressize = size;
3341
3342 while (p<endp) {
3343 Py_UNICODE c = *p;
3344
3345 /* can we encode this? */
3346 if (c<limit) {
3347 /* no overflow check, because we know that the space is enough */
3348 *str++ = (char)c;
3349 ++p;
3350 }
3351 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003352 Py_ssize_t unicodepos = p-startp;
3353 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003355 Py_ssize_t repsize;
3356 Py_ssize_t newpos;
3357 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358 Py_UNICODE *uni2;
3359 /* startpos for collecting unencodable chars */
3360 const Py_UNICODE *collstart = p;
3361 const Py_UNICODE *collend = p;
3362 /* find all unecodable characters */
3363 while ((collend < endp) && ((*collend)>=limit))
3364 ++collend;
3365 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3366 if (known_errorHandler==-1) {
3367 if ((errors==NULL) || (!strcmp(errors, "strict")))
3368 known_errorHandler = 1;
3369 else if (!strcmp(errors, "replace"))
3370 known_errorHandler = 2;
3371 else if (!strcmp(errors, "ignore"))
3372 known_errorHandler = 3;
3373 else if (!strcmp(errors, "xmlcharrefreplace"))
3374 known_errorHandler = 4;
3375 else
3376 known_errorHandler = 0;
3377 }
3378 switch (known_errorHandler) {
3379 case 1: /* strict */
3380 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3381 goto onError;
3382 case 2: /* replace */
3383 while (collstart++<collend)
3384 *str++ = '?'; /* fall through */
3385 case 3: /* ignore */
3386 p = collend;
3387 break;
3388 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003389 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 /* determine replacement size (temporarily (mis)uses p) */
3391 for (p = collstart, repsize = 0; p < collend; ++p) {
3392 if (*p<10)
3393 repsize += 2+1+1;
3394 else if (*p<100)
3395 repsize += 2+2+1;
3396 else if (*p<1000)
3397 repsize += 2+3+1;
3398 else if (*p<10000)
3399 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003400#ifndef Py_UNICODE_WIDE
3401 else
3402 repsize += 2+5+1;
3403#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404 else if (*p<100000)
3405 repsize += 2+5+1;
3406 else if (*p<1000000)
3407 repsize += 2+6+1;
3408 else
3409 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003410#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 }
3412 requiredsize = respos+repsize+(endp-collend);
3413 if (requiredsize > ressize) {
3414 if (requiredsize<2*ressize)
3415 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003416 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003418 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 ressize = requiredsize;
3420 }
3421 /* generate replacement (temporarily (mis)uses p) */
3422 for (p = collstart; p < collend; ++p) {
3423 str += sprintf(str, "&#%d;", (int)*p);
3424 }
3425 p = collend;
3426 break;
3427 default:
3428 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3429 encoding, reason, startp, size, &exc,
3430 collstart-startp, collend-startp, &newpos);
3431 if (repunicode == NULL)
3432 goto onError;
3433 /* need more space? (at least enough for what we
3434 have+the replacement+the rest of the string, so
3435 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003436 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 repsize = PyUnicode_GET_SIZE(repunicode);
3438 requiredsize = respos+repsize+(endp-collend);
3439 if (requiredsize > ressize) {
3440 if (requiredsize<2*ressize)
3441 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003442 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 Py_DECREF(repunicode);
3444 goto onError;
3445 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003446 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 ressize = requiredsize;
3448 }
3449 /* check if there is anything unencodable in the replacement
3450 and copy it to the output */
3451 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3452 c = *uni2;
3453 if (c >= limit) {
3454 raise_encode_exception(&exc, encoding, startp, size,
3455 unicodepos, unicodepos+1, reason);
3456 Py_DECREF(repunicode);
3457 goto onError;
3458 }
3459 *str = (char)c;
3460 }
3461 p = startp + newpos;
3462 Py_DECREF(repunicode);
3463 }
3464 }
3465 }
3466 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003467 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 if (respos<ressize)
3469 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003470 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 Py_XDECREF(errorHandler);
3472 Py_XDECREF(exc);
3473 return res;
3474
3475 onError:
3476 Py_XDECREF(res);
3477 Py_XDECREF(errorHandler);
3478 Py_XDECREF(exc);
3479 return NULL;
3480}
3481
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003483 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 const char *errors)
3485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487}
3488
3489PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3490{
3491 if (!PyUnicode_Check(unicode)) {
3492 PyErr_BadArgument();
3493 return NULL;
3494 }
3495 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3496 PyUnicode_GET_SIZE(unicode),
3497 NULL);
3498}
3499
3500/* --- 7-bit ASCII Codec -------------------------------------------------- */
3501
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003503 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 const char *errors)
3505{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507 PyUnicodeObject *v;
3508 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003509 Py_ssize_t startinpos;
3510 Py_ssize_t endinpos;
3511 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 const char *e;
3513 PyObject *errorHandler = NULL;
3514 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003515
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003517 if (size == 1 && *(unsigned char*)s < 128) {
3518 Py_UNICODE r = *(unsigned char*)s;
3519 return PyUnicode_FromUnicode(&r, 1);
3520 }
Tim Petersced69f82003-09-16 20:30:58 +00003521
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 v = _PyUnicode_New(size);
3523 if (v == NULL)
3524 goto onError;
3525 if (size == 0)
3526 return (PyObject *)v;
3527 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 e = s + size;
3529 while (s < e) {
3530 register unsigned char c = (unsigned char)*s;
3531 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 ++s;
3534 }
3535 else {
3536 startinpos = s-starts;
3537 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003538 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 if (unicode_decode_call_errorhandler(
3540 errors, &errorHandler,
3541 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003542 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003547 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003548 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003549 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 Py_XDECREF(errorHandler);
3551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003553
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 onError:
3555 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 Py_XDECREF(errorHandler);
3557 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 return NULL;
3559}
3560
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 const char *errors)
3564{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566}
3567
3568PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3569{
3570 if (!PyUnicode_Check(unicode)) {
3571 PyErr_BadArgument();
3572 return NULL;
3573 }
3574 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3575 PyUnicode_GET_SIZE(unicode),
3576 NULL);
3577}
3578
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003579#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003580
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003581/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003582
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003583#if SIZEOF_INT < SIZEOF_SSIZE_T
3584#define NEED_RETRY
3585#endif
3586
3587/* XXX This code is limited to "true" double-byte encodings, as
3588 a) it assumes an incomplete character consists of a single byte, and
3589 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3590 encodings, see IsDBCSLeadByteEx documentation. */
3591
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when the
   position CharPrev() reports for the preceding character is the
   current position itself, is not a lead byte, or lies exactly two
   bytes back.
   NOTE(review): presumably this filters out trail bytes whose value
   also looks like a lead byte -- confirm against the Win32 docs. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3602
3603/*
3604 * Decode MBCS string into unicode object. If 'final' is set, converts
3605 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3606 */
3607static int decode_mbcs(PyUnicodeObject **v,
3608 const char *s, /* MBCS string */
3609 int size, /* sizeof MBCS string */
3610 int final)
3611{
3612 Py_UNICODE *p;
3613 Py_ssize_t n = 0;
3614 int usize = 0;
3615
3616 assert(size >= 0);
3617
3618 /* Skip trailing lead-byte unless 'final' is set */
3619 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3620 --size;
3621
3622 /* First get the size of the result */
3623 if (size > 0) {
3624 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3625 if (usize == 0) {
3626 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3627 return -1;
3628 }
3629 }
3630
3631 if (*v == NULL) {
3632 /* Create unicode object */
3633 *v = _PyUnicode_New(usize);
3634 if (*v == NULL)
3635 return -1;
3636 }
3637 else {
3638 /* Extend unicode object */
3639 n = PyUnicode_GET_SIZE(*v);
3640 if (_PyUnicode_Resize(v, n + usize) < 0)
3641 return -1;
3642 }
3643
3644 /* Do the conversion */
3645 if (size > 0) {
3646 p = PyUnicode_AS_UNICODE(*v) + n;
3647 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3648 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3649 return -1;
3650 }
3651 }
3652
3653 return size;
3654}
3655
3656PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3657 Py_ssize_t size,
3658 const char *errors,
3659 Py_ssize_t *consumed)
3660{
3661 PyUnicodeObject *v = NULL;
3662 int done;
3663
3664 if (consumed)
3665 *consumed = 0;
3666
3667#ifdef NEED_RETRY
3668 retry:
3669 if (size > INT_MAX)
3670 done = decode_mbcs(&v, s, INT_MAX, 0);
3671 else
3672#endif
3673 done = decode_mbcs(&v, s, (int)size, !consumed);
3674
3675 if (done < 0) {
3676 Py_XDECREF(v);
3677 return NULL;
3678 }
3679
3680 if (consumed)
3681 *consumed += done;
3682
3683#ifdef NEED_RETRY
3684 if (size > INT_MAX) {
3685 s += done;
3686 size -= done;
3687 goto retry;
3688 }
3689#endif
3690
3691 return (PyObject *)v;
3692}
3693
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003694PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003695 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003696 const char *errors)
3697{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003698 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3699}
3700
3701/*
3702 * Convert unicode into string object (MBCS).
3703 * Returns 0 if succeed, -1 otherwise.
3704 */
3705static int encode_mbcs(PyObject **repr,
3706 const Py_UNICODE *p, /* unicode */
3707 int size) /* size of unicode */
3708{
3709 int mbcssize = 0;
3710 Py_ssize_t n = 0;
3711
3712 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003713
3714 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003715 if (size > 0) {
3716 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3717 if (mbcssize == 0) {
3718 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3719 return -1;
3720 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003721 }
3722
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003723 if (*repr == NULL) {
3724 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003725 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003726 if (*repr == NULL)
3727 return -1;
3728 }
3729 else {
3730 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003731 n = PyBytes_Size(*repr);
3732 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003733 return -1;
3734 }
3735
3736 /* Do the conversion */
3737 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003738 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003739 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3740 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3741 return -1;
3742 }
3743 }
3744
3745 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003746}
3747
3748PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003749 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003750 const char *errors)
3751{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003752 PyObject *repr = NULL;
3753 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003754
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003755#ifdef NEED_RETRY
3756 retry:
3757 if (size > INT_MAX)
3758 ret = encode_mbcs(&repr, p, INT_MAX);
3759 else
3760#endif
3761 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003762
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003763 if (ret < 0) {
3764 Py_XDECREF(repr);
3765 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003766 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003767
3768#ifdef NEED_RETRY
3769 if (size > INT_MAX) {
3770 p += INT_MAX;
3771 size -= INT_MAX;
3772 goto retry;
3773 }
3774#endif
3775
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003776 return repr;
3777}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003778
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003779PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3780{
3781 if (!PyUnicode_Check(unicode)) {
3782 PyErr_BadArgument();
3783 return NULL;
3784 }
3785 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3786 PyUnicode_GET_SIZE(unicode),
3787 NULL);
3788}
3789
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003790#undef NEED_RETRY
3791
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003792#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003793
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794/* --- Character Mapping Codec -------------------------------------------- */
3795
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003797 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 PyObject *mapping,
3799 const char *errors)
3800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003802 Py_ssize_t startinpos;
3803 Py_ssize_t endinpos;
3804 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 PyUnicodeObject *v;
3807 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003808 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 PyObject *errorHandler = NULL;
3810 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003811 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003812 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 /* Default to Latin-1 */
3815 if (mapping == NULL)
3816 return PyUnicode_DecodeLatin1(s, size, errors);
3817
3818 v = _PyUnicode_New(size);
3819 if (v == NULL)
3820 goto onError;
3821 if (size == 0)
3822 return (PyObject *)v;
3823 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003825 if (PyUnicode_CheckExact(mapping)) {
3826 mapstring = PyUnicode_AS_UNICODE(mapping);
3827 maplen = PyUnicode_GET_SIZE(mapping);
3828 while (s < e) {
3829 unsigned char ch = *s;
3830 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003832 if (ch < maplen)
3833 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003835 if (x == 0xfffe) {
3836 /* undefined mapping */
3837 outpos = p-PyUnicode_AS_UNICODE(v);
3838 startinpos = s-starts;
3839 endinpos = startinpos+1;
3840 if (unicode_decode_call_errorhandler(
3841 errors, &errorHandler,
3842 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003843 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003844 (PyObject **)&v, &outpos, &p)) {
3845 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003846 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003847 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003848 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003849 *p++ = x;
3850 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003852 }
3853 else {
3854 while (s < e) {
3855 unsigned char ch = *s;
3856 PyObject *w, *x;
3857
3858 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3859 w = PyInt_FromLong((long)ch);
3860 if (w == NULL)
3861 goto onError;
3862 x = PyObject_GetItem(mapping, w);
3863 Py_DECREF(w);
3864 if (x == NULL) {
3865 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3866 /* No mapping found means: mapping is undefined. */
3867 PyErr_Clear();
3868 x = Py_None;
3869 Py_INCREF(x);
3870 } else
3871 goto onError;
3872 }
3873
3874 /* Apply mapping */
3875 if (PyInt_Check(x)) {
3876 long value = PyInt_AS_LONG(x);
3877 if (value < 0 || value > 65535) {
3878 PyErr_SetString(PyExc_TypeError,
3879 "character mapping must be in range(65536)");
3880 Py_DECREF(x);
3881 goto onError;
3882 }
3883 *p++ = (Py_UNICODE)value;
3884 }
3885 else if (x == Py_None) {
3886 /* undefined mapping */
3887 outpos = p-PyUnicode_AS_UNICODE(v);
3888 startinpos = s-starts;
3889 endinpos = startinpos+1;
3890 if (unicode_decode_call_errorhandler(
3891 errors, &errorHandler,
3892 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003893 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003894 (PyObject **)&v, &outpos, &p)) {
3895 Py_DECREF(x);
3896 goto onError;
3897 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003898 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003899 continue;
3900 }
3901 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003902 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003903
3904 if (targetsize == 1)
3905 /* 1-1 mapping */
3906 *p++ = *PyUnicode_AS_UNICODE(x);
3907
3908 else if (targetsize > 1) {
3909 /* 1-n mapping */
3910 if (targetsize > extrachars) {
3911 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003912 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3913 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003914 (targetsize << 2);
3915 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003916 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003917 if (_PyUnicode_Resize(&v,
3918 PyUnicode_GET_SIZE(v) + needed) < 0) {
3919 Py_DECREF(x);
3920 goto onError;
3921 }
3922 p = PyUnicode_AS_UNICODE(v) + oldpos;
3923 }
3924 Py_UNICODE_COPY(p,
3925 PyUnicode_AS_UNICODE(x),
3926 targetsize);
3927 p += targetsize;
3928 extrachars -= targetsize;
3929 }
3930 /* 1-0 mapping: skip the character */
3931 }
3932 else {
3933 /* wrong return value */
3934 PyErr_SetString(PyExc_TypeError,
3935 "character mapping must return integer, None or unicode");
3936 Py_DECREF(x);
3937 goto onError;
3938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003940 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 }
3943 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003944 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 Py_XDECREF(errorHandler);
3947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003949
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 Py_XDECREF(errorHandler);
3952 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 Py_XDECREF(v);
3954 return NULL;
3955}
3956
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003957/* Charmap encoding: the lookup table */
3958
3959struct encoding_map{
3960 PyObject_HEAD
3961 unsigned char level1[32];
3962 int count2, count3;
3963 unsigned char level23[1];
3964};
3965
3966static PyObject*
3967encoding_map_size(PyObject *obj, PyObject* args)
3968{
3969 struct encoding_map *map = (struct encoding_map*)obj;
3970 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3971 128*map->count3);
3972}
3973
3974static PyMethodDef encoding_map_methods[] = {
3975 {"size", encoding_map_size, METH_NOARGS,
3976 PyDoc_STR("Return the size (in bytes) of this object") },
3977 { 0 }
3978};
3979
3980static void
3981encoding_map_dealloc(PyObject* o)
3982{
3983 PyObject_FREE(o);
3984}
3985
3986static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003987 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003988 "EncodingMap", /*tp_name*/
3989 sizeof(struct encoding_map), /*tp_basicsize*/
3990 0, /*tp_itemsize*/
3991 /* methods */
3992 encoding_map_dealloc, /*tp_dealloc*/
3993 0, /*tp_print*/
3994 0, /*tp_getattr*/
3995 0, /*tp_setattr*/
3996 0, /*tp_compare*/
3997 0, /*tp_repr*/
3998 0, /*tp_as_number*/
3999 0, /*tp_as_sequence*/
4000 0, /*tp_as_mapping*/
4001 0, /*tp_hash*/
4002 0, /*tp_call*/
4003 0, /*tp_str*/
4004 0, /*tp_getattro*/
4005 0, /*tp_setattro*/
4006 0, /*tp_as_buffer*/
4007 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4008 0, /*tp_doc*/
4009 0, /*tp_traverse*/
4010 0, /*tp_clear*/
4011 0, /*tp_richcompare*/
4012 0, /*tp_weaklistoffset*/
4013 0, /*tp_iter*/
4014 0, /*tp_iternext*/
4015 encoding_map_methods, /*tp_methods*/
4016 0, /*tp_members*/
4017 0, /*tp_getset*/
4018 0, /*tp_base*/
4019 0, /*tp_dict*/
4020 0, /*tp_descr_get*/
4021 0, /*tp_descr_set*/
4022 0, /*tp_dictoffset*/
4023 0, /*tp_init*/
4024 0, /*tp_alloc*/
4025 0, /*tp_new*/
4026 0, /*tp_free*/
4027 0, /*tp_is_gc*/
4028};
4029
4030PyObject*
4031PyUnicode_BuildEncodingMap(PyObject* string)
4032{
4033 Py_UNICODE *decode;
4034 PyObject *result;
4035 struct encoding_map *mresult;
4036 int i;
4037 int need_dict = 0;
4038 unsigned char level1[32];
4039 unsigned char level2[512];
4040 unsigned char *mlevel1, *mlevel2, *mlevel3;
4041 int count2 = 0, count3 = 0;
4042
4043 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4044 PyErr_BadArgument();
4045 return NULL;
4046 }
4047 decode = PyUnicode_AS_UNICODE(string);
4048 memset(level1, 0xFF, sizeof level1);
4049 memset(level2, 0xFF, sizeof level2);
4050
4051 /* If there isn't a one-to-one mapping of NULL to \0,
4052 or if there are non-BMP characters, we need to use
4053 a mapping dictionary. */
4054 if (decode[0] != 0)
4055 need_dict = 1;
4056 for (i = 1; i < 256; i++) {
4057 int l1, l2;
4058 if (decode[i] == 0
4059 #ifdef Py_UNICODE_WIDE
4060 || decode[i] > 0xFFFF
4061 #endif
4062 ) {
4063 need_dict = 1;
4064 break;
4065 }
4066 if (decode[i] == 0xFFFE)
4067 /* unmapped character */
4068 continue;
4069 l1 = decode[i] >> 11;
4070 l2 = decode[i] >> 7;
4071 if (level1[l1] == 0xFF)
4072 level1[l1] = count2++;
4073 if (level2[l2] == 0xFF)
4074 level2[l2] = count3++;
4075 }
4076
4077 if (count2 >= 0xFF || count3 >= 0xFF)
4078 need_dict = 1;
4079
4080 if (need_dict) {
4081 PyObject *result = PyDict_New();
4082 PyObject *key, *value;
4083 if (!result)
4084 return NULL;
4085 for (i = 0; i < 256; i++) {
4086 key = value = NULL;
4087 key = PyInt_FromLong(decode[i]);
4088 value = PyInt_FromLong(i);
4089 if (!key || !value)
4090 goto failed1;
4091 if (PyDict_SetItem(result, key, value) == -1)
4092 goto failed1;
4093 Py_DECREF(key);
4094 Py_DECREF(value);
4095 }
4096 return result;
4097 failed1:
4098 Py_XDECREF(key);
4099 Py_XDECREF(value);
4100 Py_DECREF(result);
4101 return NULL;
4102 }
4103
4104 /* Create a three-level trie */
4105 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4106 16*count2 + 128*count3 - 1);
4107 if (!result)
4108 return PyErr_NoMemory();
4109 PyObject_Init(result, &EncodingMapType);
4110 mresult = (struct encoding_map*)result;
4111 mresult->count2 = count2;
4112 mresult->count3 = count3;
4113 mlevel1 = mresult->level1;
4114 mlevel2 = mresult->level23;
4115 mlevel3 = mresult->level23 + 16*count2;
4116 memcpy(mlevel1, level1, 32);
4117 memset(mlevel2, 0xFF, 16*count2);
4118 memset(mlevel3, 0, 128*count3);
4119 count3 = 0;
4120 for (i = 1; i < 256; i++) {
4121 int o1, o2, o3, i2, i3;
4122 if (decode[i] == 0xFFFE)
4123 /* unmapped character */
4124 continue;
4125 o1 = decode[i]>>11;
4126 o2 = (decode[i]>>7) & 0xF;
4127 i2 = 16*mlevel1[o1] + o2;
4128 if (mlevel2[i2] == 0xFF)
4129 mlevel2[i2] = count3++;
4130 o3 = decode[i] & 0x7F;
4131 i3 = 128*mlevel2[i2] + o3;
4132 mlevel3[i3] = i;
4133 }
4134 return result;
4135}
4136
4137static int
4138encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4139{
4140 struct encoding_map *map = (struct encoding_map*)mapping;
4141 int l1 = c>>11;
4142 int l2 = (c>>7) & 0xF;
4143 int l3 = c & 0x7F;
4144 int i;
4145
4146#ifdef Py_UNICODE_WIDE
4147 if (c > 0xFFFF) {
4148 return -1;
4149 }
4150#endif
4151 if (c == 0)
4152 return 0;
4153 /* level 1*/
4154 i = map->level1[l1];
4155 if (i == 0xFF) {
4156 return -1;
4157 }
4158 /* level 2*/
4159 i = map->level23[16*i+l2];
4160 if (i == 0xFF) {
4161 return -1;
4162 }
4163 /* level 3 */
4164 i = map->level23[16*map->count2 + 128*i + l3];
4165 if (i == 0) {
4166 return -1;
4167 }
4168 return i;
4169}
4170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171/* Lookup the character ch in the mapping. If the character
4172 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004173 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 PyObject *w = PyInt_FromLong((long)c);
4177 PyObject *x;
4178
4179 if (w == NULL)
4180 return NULL;
4181 x = PyObject_GetItem(mapping, w);
4182 Py_DECREF(w);
4183 if (x == NULL) {
4184 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4185 /* No mapping found means: mapping is undefined. */
4186 PyErr_Clear();
4187 x = Py_None;
4188 Py_INCREF(x);
4189 return x;
4190 } else
4191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004193 else if (x == Py_None)
4194 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 else if (PyInt_Check(x)) {
4196 long value = PyInt_AS_LONG(x);
4197 if (value < 0 || value > 255) {
4198 PyErr_SetString(PyExc_TypeError,
4199 "character mapping must be in range(256)");
4200 Py_DECREF(x);
4201 return NULL;
4202 }
4203 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 else if (PyString_Check(x))
4206 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004209 PyErr_Format(PyExc_TypeError,
4210 "character mapping must return integer, None or str8, not %.400s",
4211 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 Py_DECREF(x);
4213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 }
4215}
4216
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004217static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004218charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004219{
Walter Dörwald827b0552007-05-12 13:23:53 +00004220 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004221 /* exponentially overallocate to minimize reallocations */
4222 if (requiredsize < 2*outsize)
4223 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004224 if (PyBytes_Resize(outobj, requiredsize)) {
4225 Py_DECREF(outobj);
4226 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004227 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004228 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004229}
4230
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004235 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 space is available. Return a new reference to the object that
4237 was put in the output buffer, or Py_None, if the mapping was undefined
4238 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004239 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004241charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004242 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004244 PyObject *rep;
4245 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004246 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004248 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004249 int res = encoding_map_lookup(c, mapping);
4250 Py_ssize_t requiredsize = *outpos+1;
4251 if (res == -1)
4252 return enc_FAILED;
4253 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004254 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004255 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004256 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004257 outstart[(*outpos)++] = (char)res;
4258 return enc_SUCCESS;
4259 }
4260
4261 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004263 return enc_EXCEPTION;
4264 else if (rep==Py_None) {
4265 Py_DECREF(rep);
4266 return enc_FAILED;
4267 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004269 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004270 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004271 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004273 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004275 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4277 }
4278 else {
4279 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004280 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4281 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004282 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004283 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004285 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004287 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 memcpy(outstart + *outpos, repchars, repsize);
4289 *outpos += repsize;
4290 }
4291 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004292 Py_DECREF(rep);
4293 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294}
4295
4296/* handle an error in PyUnicode_EncodeCharmap
4297 Return 0 on success, -1 on error */
4298static
4299int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004300 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004302 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004303 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304{
4305 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004306 Py_ssize_t repsize;
4307 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 Py_UNICODE *uni2;
4309 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004310 Py_ssize_t collstartpos = *inpos;
4311 Py_ssize_t collendpos = *inpos+1;
4312 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313 char *encoding = "charmap";
4314 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004315 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 /* find all unencodable characters */
4318 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004319 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004320 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004321 int res = encoding_map_lookup(p[collendpos], mapping);
4322 if (res != -1)
4323 break;
4324 ++collendpos;
4325 continue;
4326 }
4327
4328 rep = charmapencode_lookup(p[collendpos], mapping);
4329 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004331 else if (rep!=Py_None) {
4332 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 break;
4334 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004335 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 ++collendpos;
4337 }
4338 /* cache callback name lookup
4339 * (if not done yet, i.e. it's the first error) */
4340 if (*known_errorHandler==-1) {
4341 if ((errors==NULL) || (!strcmp(errors, "strict")))
4342 *known_errorHandler = 1;
4343 else if (!strcmp(errors, "replace"))
4344 *known_errorHandler = 2;
4345 else if (!strcmp(errors, "ignore"))
4346 *known_errorHandler = 3;
4347 else if (!strcmp(errors, "xmlcharrefreplace"))
4348 *known_errorHandler = 4;
4349 else
4350 *known_errorHandler = 0;
4351 }
4352 switch (*known_errorHandler) {
4353 case 1: /* strict */
4354 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4355 return -1;
4356 case 2: /* replace */
4357 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4358 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004359 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 return -1;
4361 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004362 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4364 return -1;
4365 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 }
4367 /* fall through */
4368 case 3: /* ignore */
4369 *inpos = collendpos;
4370 break;
4371 case 4: /* xmlcharrefreplace */
4372 /* generate replacement (temporarily (mis)uses p) */
4373 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4374 char buffer[2+29+1+1];
4375 char *cp;
4376 sprintf(buffer, "&#%d;", (int)p[collpos]);
4377 for (cp = buffer; *cp; ++cp) {
4378 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004379 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004381 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4383 return -1;
4384 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 }
4386 }
4387 *inpos = collendpos;
4388 break;
4389 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004390 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 encoding, reason, p, size, exceptionObject,
4392 collstartpos, collendpos, &newpos);
4393 if (repunicode == NULL)
4394 return -1;
4395 /* generate replacement */
4396 repsize = PyUnicode_GET_SIZE(repunicode);
4397 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4398 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004399 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 return -1;
4401 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004402 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4405 return -1;
4406 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 }
4408 *inpos = newpos;
4409 Py_DECREF(repunicode);
4410 }
4411 return 0;
4412}
4413
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 PyObject *mapping,
4417 const char *errors)
4418{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 /* output object */
4420 PyObject *res = NULL;
4421 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004422 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004424 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 PyObject *errorHandler = NULL;
4426 PyObject *exc = NULL;
4427 /* the following variable is used for caching string comparisons
4428 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4429 * 3=ignore, 4=xmlcharrefreplace */
4430 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431
4432 /* Default to Latin-1 */
4433 if (mapping == NULL)
4434 return PyUnicode_EncodeLatin1(p, size, errors);
4435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 /* allocate enough for a simple encoding without
4437 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004438 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 if (res == NULL)
4440 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004441 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 while (inpos<size) {
4445 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004446 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004447 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004449 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 if (charmap_encoding_error(p, size, &inpos, mapping,
4451 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004452 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004453 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004454 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 else
4458 /* done with this character => adjust input position */
4459 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004463 if (respos<PyBytes_GET_SIZE(res)) {
4464 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 goto onError;
4466 }
4467 Py_XDECREF(exc);
4468 Py_XDECREF(errorHandler);
4469 return res;
4470
4471 onError:
4472 Py_XDECREF(res);
4473 Py_XDECREF(exc);
4474 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 return NULL;
4476}
4477
4478PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4479 PyObject *mapping)
4480{
4481 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4482 PyErr_BadArgument();
4483 return NULL;
4484 }
4485 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4486 PyUnicode_GET_SIZE(unicode),
4487 mapping,
4488 NULL);
4489}
4490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491/* create or adjust a UnicodeTranslateError */
4492static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004493 const Py_UNICODE *unicode, Py_ssize_t size,
4494 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497 if (*exceptionObject == NULL) {
4498 *exceptionObject = PyUnicodeTranslateError_Create(
4499 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 }
4501 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4503 goto onError;
4504 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4505 goto onError;
4506 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4507 goto onError;
4508 return;
4509 onError:
4510 Py_DECREF(*exceptionObject);
4511 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512 }
4513}
4514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515/* raises a UnicodeTranslateError */
4516static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004517 const Py_UNICODE *unicode, Py_ssize_t size,
4518 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 const char *reason)
4520{
4521 make_translate_exception(exceptionObject,
4522 unicode, size, startpos, endpos, reason);
4523 if (*exceptionObject != NULL)
4524 PyCodec_StrictErrors(*exceptionObject);
4525}
4526
4527/* error handling callback helper:
4528 build arguments, call the callback and check the arguments,
4529 put the result into newpos and return the replacement string, which
4530 has to be freed by the caller */
4531static PyObject *unicode_translate_call_errorhandler(const char *errors,
4532 PyObject **errorHandler,
4533 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004534 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4535 Py_ssize_t startpos, Py_ssize_t endpos,
4536 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004538 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004540 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 PyObject *restuple;
4542 PyObject *resunicode;
4543
4544 if (*errorHandler == NULL) {
4545 *errorHandler = PyCodec_LookupError(errors);
4546 if (*errorHandler == NULL)
4547 return NULL;
4548 }
4549
4550 make_translate_exception(exceptionObject,
4551 unicode, size, startpos, endpos, reason);
4552 if (*exceptionObject == NULL)
4553 return NULL;
4554
4555 restuple = PyObject_CallFunctionObjArgs(
4556 *errorHandler, *exceptionObject, NULL);
4557 if (restuple == NULL)
4558 return NULL;
4559 if (!PyTuple_Check(restuple)) {
4560 PyErr_Format(PyExc_TypeError, &argparse[4]);
4561 Py_DECREF(restuple);
4562 return NULL;
4563 }
4564 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004565 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 Py_DECREF(restuple);
4567 return NULL;
4568 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 if (i_newpos<0)
4570 *newpos = size+i_newpos;
4571 else
4572 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004573 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004574 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004575 Py_DECREF(restuple);
4576 return NULL;
4577 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 Py_INCREF(resunicode);
4579 Py_DECREF(restuple);
4580 return resunicode;
4581}
4582
4583/* Lookup the character ch in the mapping and put the result in result,
4584 which must be decrefed by the caller.
4585 Return 0 on success, -1 on error */
4586static
4587int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4588{
4589 PyObject *w = PyInt_FromLong((long)c);
4590 PyObject *x;
4591
4592 if (w == NULL)
4593 return -1;
4594 x = PyObject_GetItem(mapping, w);
4595 Py_DECREF(w);
4596 if (x == NULL) {
4597 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4598 /* No mapping found means: use 1:1 mapping. */
4599 PyErr_Clear();
4600 *result = NULL;
4601 return 0;
4602 } else
4603 return -1;
4604 }
4605 else if (x == Py_None) {
4606 *result = x;
4607 return 0;
4608 }
4609 else if (PyInt_Check(x)) {
4610 long value = PyInt_AS_LONG(x);
4611 long max = PyUnicode_GetMax();
4612 if (value < 0 || value > max) {
4613 PyErr_Format(PyExc_TypeError,
4614 "character mapping must be in range(0x%lx)", max+1);
4615 Py_DECREF(x);
4616 return -1;
4617 }
4618 *result = x;
4619 return 0;
4620 }
4621 else if (PyUnicode_Check(x)) {
4622 *result = x;
4623 return 0;
4624 }
4625 else {
4626 /* wrong return value */
4627 PyErr_SetString(PyExc_TypeError,
4628 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004629 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630 return -1;
4631 }
4632}
4633/* ensure that *outobj is at least requiredsize characters long,
4634if not reallocate and adjust various state variables.
4635Return 0 on success, -1 on error */
4636static
Walter Dörwald4894c302003-10-24 14:25:28 +00004637int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004640 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004641 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004643 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004645 if (requiredsize < 2 * oldsize)
4646 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004647 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004648 return -1;
4649 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 }
4651 return 0;
4652}
4653/* lookup the character, put the result in the output string and adjust
4654 various state variables. Return a new reference to the object that
4655 was put in the output buffer in *result, or Py_None, if the mapping was
4656 undefined (in which case no character was written).
4657 The called must decref result.
4658 Return 0 on success, -1 on error. */
4659static
Walter Dörwald4894c302003-10-24 14:25:28 +00004660int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004661 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004662 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663{
Walter Dörwald4894c302003-10-24 14:25:28 +00004664 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 return -1;
4666 if (*res==NULL) {
4667 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004668 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669 }
4670 else if (*res==Py_None)
4671 ;
4672 else if (PyInt_Check(*res)) {
4673 /* no overflow check, because we know that the space is enough */
4674 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4675 }
4676 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004677 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 if (repsize==1) {
4679 /* no overflow check, because we know that the space is enough */
4680 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4681 }
4682 else if (repsize!=0) {
4683 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004684 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004685 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004686 repsize - 1;
4687 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 return -1;
4689 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4690 *outp += repsize;
4691 }
4692 }
4693 else
4694 return -1;
4695 return 0;
4696}
4697
4698PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004699 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 PyObject *mapping,
4701 const char *errors)
4702{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 /* output object */
4704 PyObject *res = NULL;
4705 /* pointers to the beginning and end+1 of input */
4706 const Py_UNICODE *startp = p;
4707 const Py_UNICODE *endp = p + size;
4708 /* pointer into the output */
4709 Py_UNICODE *str;
4710 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004711 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 char *reason = "character maps to <undefined>";
4713 PyObject *errorHandler = NULL;
4714 PyObject *exc = NULL;
4715 /* the following variable is used for caching string comparisons
4716 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4717 * 3=ignore, 4=xmlcharrefreplace */
4718 int known_errorHandler = -1;
4719
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 if (mapping == NULL) {
4721 PyErr_BadArgument();
4722 return NULL;
4723 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004724
4725 /* allocate enough for a simple 1:1 translation without
4726 replacements, if we need more, we'll resize */
4727 res = PyUnicode_FromUnicode(NULL, size);
4728 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004729 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004731 return res;
4732 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 while (p<endp) {
4735 /* try to encode it */
4736 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004737 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004738 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 goto onError;
4740 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004741 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 if (x!=Py_None) /* it worked => adjust input pointer */
4743 ++p;
4744 else { /* untranslatable character */
4745 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746 Py_ssize_t repsize;
4747 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 Py_UNICODE *uni2;
4749 /* startpos for collecting untranslatable chars */
4750 const Py_UNICODE *collstart = p;
4751 const Py_UNICODE *collend = p+1;
4752 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 /* find all untranslatable characters */
4755 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004756 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 goto onError;
4758 Py_XDECREF(x);
4759 if (x!=Py_None)
4760 break;
4761 ++collend;
4762 }
4763 /* cache callback name lookup
4764 * (if not done yet, i.e. it's the first error) */
4765 if (known_errorHandler==-1) {
4766 if ((errors==NULL) || (!strcmp(errors, "strict")))
4767 known_errorHandler = 1;
4768 else if (!strcmp(errors, "replace"))
4769 known_errorHandler = 2;
4770 else if (!strcmp(errors, "ignore"))
4771 known_errorHandler = 3;
4772 else if (!strcmp(errors, "xmlcharrefreplace"))
4773 known_errorHandler = 4;
4774 else
4775 known_errorHandler = 0;
4776 }
4777 switch (known_errorHandler) {
4778 case 1: /* strict */
4779 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4780 goto onError;
4781 case 2: /* replace */
4782 /* No need to check for space, this is a 1:1 replacement */
4783 for (coll = collstart; coll<collend; ++coll)
4784 *str++ = '?';
4785 /* fall through */
4786 case 3: /* ignore */
4787 p = collend;
4788 break;
4789 case 4: /* xmlcharrefreplace */
4790 /* generate replacement (temporarily (mis)uses p) */
4791 for (p = collstart; p < collend; ++p) {
4792 char buffer[2+29+1+1];
4793 char *cp;
4794 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004795 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4797 goto onError;
4798 for (cp = buffer; *cp; ++cp)
4799 *str++ = *cp;
4800 }
4801 p = collend;
4802 break;
4803 default:
4804 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4805 reason, startp, size, &exc,
4806 collstart-startp, collend-startp, &newpos);
4807 if (repunicode == NULL)
4808 goto onError;
4809 /* generate replacement */
4810 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004811 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4813 Py_DECREF(repunicode);
4814 goto onError;
4815 }
4816 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4817 *str++ = *uni2;
4818 p = startp + newpos;
4819 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 }
4821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 /* Resize if we allocated to much */
4824 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004825 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004826 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004827 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 }
4829 Py_XDECREF(exc);
4830 Py_XDECREF(errorHandler);
4831 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 onError:
4834 Py_XDECREF(res);
4835 Py_XDECREF(exc);
4836 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 return NULL;
4838}
4839
4840PyObject *PyUnicode_Translate(PyObject *str,
4841 PyObject *mapping,
4842 const char *errors)
4843{
4844 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 str = PyUnicode_FromObject(str);
4847 if (str == NULL)
4848 goto onError;
4849 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4850 PyUnicode_GET_SIZE(str),
4851 mapping,
4852 errors);
4853 Py_DECREF(str);
4854 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004855
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 onError:
4857 Py_XDECREF(str);
4858 return NULL;
4859}
Tim Petersced69f82003-09-16 20:30:58 +00004860
Guido van Rossum9e896b32000-04-05 20:11:21 +00004861/* --- Decimal Encoder ---------------------------------------------------- */
4862
4863int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004865 char *output,
4866 const char *errors)
4867{
4868 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 PyObject *errorHandler = NULL;
4870 PyObject *exc = NULL;
4871 const char *encoding = "decimal";
4872 const char *reason = "invalid decimal Unicode string";
4873 /* the following variable is used for caching string comparisons
4874 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4875 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004876
4877 if (output == NULL) {
4878 PyErr_BadArgument();
4879 return -1;
4880 }
4881
4882 p = s;
4883 end = s + length;
4884 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004886 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004888 Py_ssize_t repsize;
4889 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004890 Py_UNICODE *uni2;
4891 Py_UNICODE *collstart;
4892 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004893
Guido van Rossum9e896b32000-04-05 20:11:21 +00004894 if (Py_UNICODE_ISSPACE(ch)) {
4895 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004897 continue;
4898 }
4899 decimal = Py_UNICODE_TODECIMAL(ch);
4900 if (decimal >= 0) {
4901 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004903 continue;
4904 }
Guido van Rossumba477042000-04-06 18:18:10 +00004905 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004906 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004908 continue;
4909 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 /* All other characters are considered unencodable */
4911 collstart = p;
4912 collend = p+1;
4913 while (collend < end) {
4914 if ((0 < *collend && *collend < 256) ||
4915 !Py_UNICODE_ISSPACE(*collend) ||
4916 Py_UNICODE_TODECIMAL(*collend))
4917 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004918 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919 /* cache callback name lookup
4920 * (if not done yet, i.e. it's the first error) */
4921 if (known_errorHandler==-1) {
4922 if ((errors==NULL) || (!strcmp(errors, "strict")))
4923 known_errorHandler = 1;
4924 else if (!strcmp(errors, "replace"))
4925 known_errorHandler = 2;
4926 else if (!strcmp(errors, "ignore"))
4927 known_errorHandler = 3;
4928 else if (!strcmp(errors, "xmlcharrefreplace"))
4929 known_errorHandler = 4;
4930 else
4931 known_errorHandler = 0;
4932 }
4933 switch (known_errorHandler) {
4934 case 1: /* strict */
4935 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4936 goto onError;
4937 case 2: /* replace */
4938 for (p = collstart; p < collend; ++p)
4939 *output++ = '?';
4940 /* fall through */
4941 case 3: /* ignore */
4942 p = collend;
4943 break;
4944 case 4: /* xmlcharrefreplace */
4945 /* generate replacement (temporarily (mis)uses p) */
4946 for (p = collstart; p < collend; ++p)
4947 output += sprintf(output, "&#%d;", (int)*p);
4948 p = collend;
4949 break;
4950 default:
4951 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4952 encoding, reason, s, length, &exc,
4953 collstart-s, collend-s, &newpos);
4954 if (repunicode == NULL)
4955 goto onError;
4956 /* generate replacement */
4957 repsize = PyUnicode_GET_SIZE(repunicode);
4958 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4959 Py_UNICODE ch = *uni2;
4960 if (Py_UNICODE_ISSPACE(ch))
4961 *output++ = ' ';
4962 else {
4963 decimal = Py_UNICODE_TODECIMAL(ch);
4964 if (decimal >= 0)
4965 *output++ = '0' + decimal;
4966 else if (0 < ch && ch < 256)
4967 *output++ = (char)ch;
4968 else {
4969 Py_DECREF(repunicode);
4970 raise_encode_exception(&exc, encoding,
4971 s, length, collstart-s, collend-s, reason);
4972 goto onError;
4973 }
4974 }
4975 }
4976 p = s + newpos;
4977 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004978 }
4979 }
4980 /* 0-terminate the output string */
4981 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004982 Py_XDECREF(exc);
4983 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004984 return 0;
4985
4986 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 Py_XDECREF(exc);
4988 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004989 return -1;
4990}
4991
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992/* --- Helpers ------------------------------------------------------------ */
4993
Eric Smith8c663262007-08-25 02:26:07 +00004994#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00004995
4996#include "stringlib/fastsearch.h"
4997
4998#include "stringlib/count.h"
4999#include "stringlib/find.h"
5000#include "stringlib/partition.h"
5001
/* helper macro to fixup start/end slice values.
   Operates in place on variables named start and end that must exist in
   the calling scope, using (obj)->length as the length:
     - a negative start has the length added and is then clamped to 0;
     - an end greater than the length is clamped to the length;
     - a negative end has the length added and is then clamped to 0.
   start is not clamped against the length.
   The expansion is a plain sequence of if statements that already ends
   in ';' (it is not wrapped in do { } while (0)), so use it only as a
   complete statement inside a braced block. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5014
Martin v. Löwis18e16552006-02-15 17:27:45 +00005015Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005016 PyObject *substr,
5017 Py_ssize_t start,
5018 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005020 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005021 PyUnicodeObject* str_obj;
5022 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005023
Thomas Wouters477c8d52006-05-27 19:21:47 +00005024 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5025 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005027 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5028 if (!sub_obj) {
5029 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 return -1;
5031 }
Tim Petersced69f82003-09-16 20:30:58 +00005032
Thomas Wouters477c8d52006-05-27 19:21:47 +00005033 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005034
Thomas Wouters477c8d52006-05-27 19:21:47 +00005035 result = stringlib_count(
5036 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5037 );
5038
5039 Py_DECREF(sub_obj);
5040 Py_DECREF(str_obj);
5041
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 return result;
5043}
5044
Martin v. Löwis18e16552006-02-15 17:27:45 +00005045Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005046 PyObject *sub,
5047 Py_ssize_t start,
5048 Py_ssize_t end,
5049 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005051 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005052
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005054 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005055 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005056 sub = PyUnicode_FromObject(sub);
5057 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005058 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005059 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060 }
Tim Petersced69f82003-09-16 20:30:58 +00005061
Thomas Wouters477c8d52006-05-27 19:21:47 +00005062 if (direction > 0)
5063 result = stringlib_find_slice(
5064 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5065 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5066 start, end
5067 );
5068 else
5069 result = stringlib_rfind_slice(
5070 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5071 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5072 start, end
5073 );
5074
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005076 Py_DECREF(sub);
5077
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 return result;
5079}
5080
Tim Petersced69f82003-09-16 20:30:58 +00005081static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082int tailmatch(PyUnicodeObject *self,
5083 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005084 Py_ssize_t start,
5085 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 int direction)
5087{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 if (substring->length == 0)
5089 return 1;
5090
Thomas Wouters477c8d52006-05-27 19:21:47 +00005091 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092
5093 end -= substring->length;
5094 if (end < start)
5095 return 0;
5096
5097 if (direction > 0) {
5098 if (Py_UNICODE_MATCH(self, end, substring))
5099 return 1;
5100 } else {
5101 if (Py_UNICODE_MATCH(self, start, substring))
5102 return 1;
5103 }
5104
5105 return 0;
5106}
5107
Martin v. Löwis18e16552006-02-15 17:27:45 +00005108Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005110 Py_ssize_t start,
5111 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 int direction)
5113{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005114 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005115
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 str = PyUnicode_FromObject(str);
5117 if (str == NULL)
5118 return -1;
5119 substr = PyUnicode_FromObject(substr);
5120 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005121 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 return -1;
5123 }
Tim Petersced69f82003-09-16 20:30:58 +00005124
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 result = tailmatch((PyUnicodeObject *)str,
5126 (PyUnicodeObject *)substr,
5127 start, end, direction);
5128 Py_DECREF(str);
5129 Py_DECREF(substr);
5130 return result;
5131}
5132
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133/* Apply fixfct filter to the Unicode object self and return a
5134 reference to the modified object */
5135
Tim Petersced69f82003-09-16 20:30:58 +00005136static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137PyObject *fixup(PyUnicodeObject *self,
5138 int (*fixfct)(PyUnicodeObject *s))
5139{
5140
5141 PyUnicodeObject *u;
5142
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005143 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 if (u == NULL)
5145 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005146
5147 Py_UNICODE_COPY(u->str, self->str, self->length);
5148
Tim Peters7a29bd52001-09-12 03:03:31 +00005149 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 /* fixfct should return TRUE if it modified the buffer. If
5151 FALSE, return a reference to the original buffer instead
5152 (to save space, not time) */
5153 Py_INCREF(self);
5154 Py_DECREF(u);
5155 return (PyObject*) self;
5156 }
5157 return (PyObject*) u;
5158}
5159
Tim Petersced69f82003-09-16 20:30:58 +00005160static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161int fixupper(PyUnicodeObject *self)
5162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005163 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 Py_UNICODE *s = self->str;
5165 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 while (len-- > 0) {
5168 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005169
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 ch = Py_UNICODE_TOUPPER(*s);
5171 if (ch != *s) {
5172 status = 1;
5173 *s = ch;
5174 }
5175 s++;
5176 }
5177
5178 return status;
5179}
5180
Tim Petersced69f82003-09-16 20:30:58 +00005181static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182int fixlower(PyUnicodeObject *self)
5183{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005184 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 Py_UNICODE *s = self->str;
5186 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005187
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 while (len-- > 0) {
5189 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 ch = Py_UNICODE_TOLOWER(*s);
5192 if (ch != *s) {
5193 status = 1;
5194 *s = ch;
5195 }
5196 s++;
5197 }
5198
5199 return status;
5200}
5201
Tim Petersced69f82003-09-16 20:30:58 +00005202static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203int fixswapcase(PyUnicodeObject *self)
5204{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005205 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 Py_UNICODE *s = self->str;
5207 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005208
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 while (len-- > 0) {
5210 if (Py_UNICODE_ISUPPER(*s)) {
5211 *s = Py_UNICODE_TOLOWER(*s);
5212 status = 1;
5213 } else if (Py_UNICODE_ISLOWER(*s)) {
5214 *s = Py_UNICODE_TOUPPER(*s);
5215 status = 1;
5216 }
5217 s++;
5218 }
5219
5220 return status;
5221}
5222
Tim Petersced69f82003-09-16 20:30:58 +00005223static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224int fixcapitalize(PyUnicodeObject *self)
5225{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005226 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005227 Py_UNICODE *s = self->str;
5228 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005229
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005230 if (len == 0)
5231 return 0;
5232 if (Py_UNICODE_ISLOWER(*s)) {
5233 *s = Py_UNICODE_TOUPPER(*s);
5234 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005236 s++;
5237 while (--len > 0) {
5238 if (Py_UNICODE_ISUPPER(*s)) {
5239 *s = Py_UNICODE_TOLOWER(*s);
5240 status = 1;
5241 }
5242 s++;
5243 }
5244 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245}
5246
5247static
5248int fixtitle(PyUnicodeObject *self)
5249{
5250 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5251 register Py_UNICODE *e;
5252 int previous_is_cased;
5253
5254 /* Shortcut for single character strings */
5255 if (PyUnicode_GET_SIZE(self) == 1) {
5256 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5257 if (*p != ch) {
5258 *p = ch;
5259 return 1;
5260 }
5261 else
5262 return 0;
5263 }
Tim Petersced69f82003-09-16 20:30:58 +00005264
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 e = p + PyUnicode_GET_SIZE(self);
5266 previous_is_cased = 0;
5267 for (; p < e; p++) {
5268 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005269
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 if (previous_is_cased)
5271 *p = Py_UNICODE_TOLOWER(ch);
5272 else
5273 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005274
5275 if (Py_UNICODE_ISLOWER(ch) ||
5276 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 Py_UNICODE_ISTITLE(ch))
5278 previous_is_cased = 1;
5279 else
5280 previous_is_cased = 0;
5281 }
5282 return 1;
5283}
5284
Tim Peters8ce9f162004-08-27 01:49:32 +00005285PyObject *
5286PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287{
Tim Peters8ce9f162004-08-27 01:49:32 +00005288 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005289 const Py_UNICODE blank = ' ';
5290 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005291 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005292 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005293 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5294 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005295 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5296 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005298 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005299 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300
Tim Peters05eba1f2004-08-27 21:32:02 +00005301 fseq = PySequence_Fast(seq, "");
5302 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005303 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005304 }
5305
Tim Peters91879ab2004-08-27 22:35:44 +00005306 /* Grrrr. A codec may be invoked to convert str objects to
5307 * Unicode, and so it's possible to call back into Python code
5308 * during PyUnicode_FromObject(), and so it's possible for a sick
5309 * codec to change the size of fseq (if seq is a list). Therefore
5310 * we have to keep refetching the size -- can't assume seqlen
5311 * is invariant.
5312 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005313 seqlen = PySequence_Fast_GET_SIZE(fseq);
5314 /* If empty sequence, return u"". */
5315 if (seqlen == 0) {
5316 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5317 goto Done;
5318 }
5319 /* If singleton sequence with an exact Unicode, return that. */
5320 if (seqlen == 1) {
5321 item = PySequence_Fast_GET_ITEM(fseq, 0);
5322 if (PyUnicode_CheckExact(item)) {
5323 Py_INCREF(item);
5324 res = (PyUnicodeObject *)item;
5325 goto Done;
5326 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005327 }
5328
Tim Peters05eba1f2004-08-27 21:32:02 +00005329 /* At least two items to join, or one that isn't exact Unicode. */
5330 if (seqlen > 1) {
5331 /* Set up sep and seplen -- they're needed. */
5332 if (separator == NULL) {
5333 sep = &blank;
5334 seplen = 1;
5335 }
5336 else {
5337 internal_separator = PyUnicode_FromObject(separator);
5338 if (internal_separator == NULL)
5339 goto onError;
5340 sep = PyUnicode_AS_UNICODE(internal_separator);
5341 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005342 /* In case PyUnicode_FromObject() mutated seq. */
5343 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005344 }
5345 }
5346
5347 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005348 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005349 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005350 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005351 res_p = PyUnicode_AS_UNICODE(res);
5352 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005353
Tim Peters05eba1f2004-08-27 21:32:02 +00005354 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005355 Py_ssize_t itemlen;
5356 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005357
5358 item = PySequence_Fast_GET_ITEM(fseq, i);
5359 /* Convert item to Unicode. */
5360 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5361 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005362 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005363 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005364 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005365 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005366 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005367 item = PyUnicode_FromObject(item);
5368 if (item == NULL)
5369 goto onError;
5370 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005371
Tim Peters91879ab2004-08-27 22:35:44 +00005372 /* In case PyUnicode_FromObject() mutated seq. */
5373 seqlen = PySequence_Fast_GET_SIZE(fseq);
5374
Tim Peters8ce9f162004-08-27 01:49:32 +00005375 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005377 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005378 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005379 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005380 if (i < seqlen - 1) {
5381 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005382 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005383 goto Overflow;
5384 }
5385 if (new_res_used > res_alloc) {
5386 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005387 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005388 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005389 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005390 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005391 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005392 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005393 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005395 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005396 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005398
5399 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005400 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005401 res_p += itemlen;
5402 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005403 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005404 res_p += seplen;
5405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005407 res_used = new_res_used;
5408 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005409
Tim Peters05eba1f2004-08-27 21:32:02 +00005410 /* Shrink res to match the used area; this probably can't fail,
5411 * but it's cheap to check.
5412 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005413 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005414 goto onError;
5415
5416 Done:
5417 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005418 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 return (PyObject *)res;
5420
Tim Peters8ce9f162004-08-27 01:49:32 +00005421 Overflow:
5422 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005423 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005424 Py_DECREF(item);
5425 /* fall through */
5426
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005428 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005429 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005430 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 return NULL;
5432}
5433
Tim Petersced69f82003-09-16 20:30:58 +00005434static
5435PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005436 Py_ssize_t left,
5437 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 Py_UNICODE fill)
5439{
5440 PyUnicodeObject *u;
5441
5442 if (left < 0)
5443 left = 0;
5444 if (right < 0)
5445 right = 0;
5446
Tim Peters7a29bd52001-09-12 03:03:31 +00005447 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 Py_INCREF(self);
5449 return self;
5450 }
5451
5452 u = _PyUnicode_New(left + self->length + right);
5453 if (u) {
5454 if (left)
5455 Py_UNICODE_FILL(u->str, fill, left);
5456 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5457 if (right)
5458 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5459 }
5460
5461 return u;
5462}
5463
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expansion site must provide a `PyObject *str` scratch variable, the
   target `list`, and an `onError` label, which is jumped to when either
   the slice cannot be created or the append fails.

   The body is wrapped in do { } while (0) so the macro expands to exactly
   one statement and stays safe inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5474
5475static
5476PyObject *split_whitespace(PyUnicodeObject *self,
5477 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005478 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005480 register Py_ssize_t i;
5481 register Py_ssize_t j;
5482 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 PyObject *str;
5484
5485 for (i = j = 0; i < len; ) {
5486 /* find a token */
5487 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5488 i++;
5489 j = i;
5490 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5491 i++;
5492 if (j < i) {
5493 if (maxcount-- <= 0)
5494 break;
5495 SPLIT_APPEND(self->str, j, i);
5496 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5497 i++;
5498 j = i;
5499 }
5500 }
5501 if (j < len) {
5502 SPLIT_APPEND(self->str, j, len);
5503 }
5504 return list;
5505
5506 onError:
5507 Py_DECREF(list);
5508 return NULL;
5509}
5510
5511PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005512 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005514 register Py_ssize_t i;
5515 register Py_ssize_t j;
5516 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 PyObject *list;
5518 PyObject *str;
5519 Py_UNICODE *data;
5520
5521 string = PyUnicode_FromObject(string);
5522 if (string == NULL)
5523 return NULL;
5524 data = PyUnicode_AS_UNICODE(string);
5525 len = PyUnicode_GET_SIZE(string);
5526
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 list = PyList_New(0);
5528 if (!list)
5529 goto onError;
5530
5531 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005532 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005533
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005535 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537
5538 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005539 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 if (i < len) {
5541 if (data[i] == '\r' && i + 1 < len &&
5542 data[i+1] == '\n')
5543 i += 2;
5544 else
5545 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005546 if (keepends)
5547 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 }
Guido van Rossum86662912000-04-11 15:38:46 +00005549 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 j = i;
5551 }
5552 if (j < len) {
5553 SPLIT_APPEND(data, j, len);
5554 }
5555
5556 Py_DECREF(string);
5557 return list;
5558
5559 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005560 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 Py_DECREF(string);
5562 return NULL;
5563}
5564
Tim Petersced69f82003-09-16 20:30:58 +00005565static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566PyObject *split_char(PyUnicodeObject *self,
5567 PyObject *list,
5568 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005569 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005571 register Py_ssize_t i;
5572 register Py_ssize_t j;
5573 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 PyObject *str;
5575
5576 for (i = j = 0; i < len; ) {
5577 if (self->str[i] == ch) {
5578 if (maxcount-- <= 0)
5579 break;
5580 SPLIT_APPEND(self->str, j, i);
5581 i = j = i + 1;
5582 } else
5583 i++;
5584 }
5585 if (j <= len) {
5586 SPLIT_APPEND(self->str, j, len);
5587 }
5588 return list;
5589
5590 onError:
5591 Py_DECREF(list);
5592 return NULL;
5593}
5594
Tim Petersced69f82003-09-16 20:30:58 +00005595static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596PyObject *split_substring(PyUnicodeObject *self,
5597 PyObject *list,
5598 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005599 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005601 register Py_ssize_t i;
5602 register Py_ssize_t j;
5603 Py_ssize_t len = self->length;
5604 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 PyObject *str;
5606
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005607 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 if (Py_UNICODE_MATCH(self, i, substring)) {
5609 if (maxcount-- <= 0)
5610 break;
5611 SPLIT_APPEND(self->str, j, i);
5612 i = j = i + sublen;
5613 } else
5614 i++;
5615 }
5616 if (j <= len) {
5617 SPLIT_APPEND(self->str, j, len);
5618 }
5619 return list;
5620
5621 onError:
5622 Py_DECREF(list);
5623 return NULL;
5624}
5625
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005626static
5627PyObject *rsplit_whitespace(PyUnicodeObject *self,
5628 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005629 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005630{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005631 register Py_ssize_t i;
5632 register Py_ssize_t j;
5633 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005634 PyObject *str;
5635
5636 for (i = j = len - 1; i >= 0; ) {
5637 /* find a token */
5638 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5639 i--;
5640 j = i;
5641 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5642 i--;
5643 if (j > i) {
5644 if (maxcount-- <= 0)
5645 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005646 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005647 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5648 i--;
5649 j = i;
5650 }
5651 }
5652 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005653 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005654 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005655 if (PyList_Reverse(list) < 0)
5656 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005657 return list;
5658
5659 onError:
5660 Py_DECREF(list);
5661 return NULL;
5662}
5663
5664static
5665PyObject *rsplit_char(PyUnicodeObject *self,
5666 PyObject *list,
5667 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005668 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005669{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005670 register Py_ssize_t i;
5671 register Py_ssize_t j;
5672 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005673 PyObject *str;
5674
5675 for (i = j = len - 1; i >= 0; ) {
5676 if (self->str[i] == ch) {
5677 if (maxcount-- <= 0)
5678 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005679 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005680 j = i = i - 1;
5681 } else
5682 i--;
5683 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005684 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005685 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005686 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005687 if (PyList_Reverse(list) < 0)
5688 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005689 return list;
5690
5691 onError:
5692 Py_DECREF(list);
5693 return NULL;
5694}
5695
5696static
5697PyObject *rsplit_substring(PyUnicodeObject *self,
5698 PyObject *list,
5699 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005700 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005701{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005702 register Py_ssize_t i;
5703 register Py_ssize_t j;
5704 Py_ssize_t len = self->length;
5705 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005706 PyObject *str;
5707
5708 for (i = len - sublen, j = len; i >= 0; ) {
5709 if (Py_UNICODE_MATCH(self, i, substring)) {
5710 if (maxcount-- <= 0)
5711 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005712 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005713 j = i;
5714 i -= sublen;
5715 } else
5716 i--;
5717 }
5718 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005719 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005720 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005721 if (PyList_Reverse(list) < 0)
5722 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005723 return list;
5724
5725 onError:
5726 Py_DECREF(list);
5727 return NULL;
5728}
5729
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730#undef SPLIT_APPEND
5731
5732static
5733PyObject *split(PyUnicodeObject *self,
5734 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005735 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736{
5737 PyObject *list;
5738
5739 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005740 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
5742 list = PyList_New(0);
5743 if (!list)
5744 return NULL;
5745
5746 if (substring == NULL)
5747 return split_whitespace(self,list,maxcount);
5748
5749 else if (substring->length == 1)
5750 return split_char(self,list,substring->str[0],maxcount);
5751
5752 else if (substring->length == 0) {
5753 Py_DECREF(list);
5754 PyErr_SetString(PyExc_ValueError, "empty separator");
5755 return NULL;
5756 }
5757 else
5758 return split_substring(self,list,substring,maxcount);
5759}
5760
Tim Petersced69f82003-09-16 20:30:58 +00005761static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005762PyObject *rsplit(PyUnicodeObject *self,
5763 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005764 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005765{
5766 PyObject *list;
5767
5768 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005769 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005770
5771 list = PyList_New(0);
5772 if (!list)
5773 return NULL;
5774
5775 if (substring == NULL)
5776 return rsplit_whitespace(self,list,maxcount);
5777
5778 else if (substring->length == 1)
5779 return rsplit_char(self,list,substring->str[0],maxcount);
5780
5781 else if (substring->length == 0) {
5782 Py_DECREF(list);
5783 PyErr_SetString(PyExc_ValueError, "empty separator");
5784 return NULL;
5785 }
5786 else
5787 return rsplit_substring(self,list,substring,maxcount);
5788}
5789
5790static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791PyObject *replace(PyUnicodeObject *self,
5792 PyUnicodeObject *str1,
5793 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005794 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795{
5796 PyUnicodeObject *u;
5797
5798 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005799 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Thomas Wouters477c8d52006-05-27 19:21:47 +00005801 if (str1->length == str2->length) {
5802 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005803 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005804 if (str1->length == 1) {
5805 /* replace characters */
5806 Py_UNICODE u1, u2;
5807 if (!findchar(self->str, self->length, str1->str[0]))
5808 goto nothing;
5809 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5810 if (!u)
5811 return NULL;
5812 Py_UNICODE_COPY(u->str, self->str, self->length);
5813 u1 = str1->str[0];
5814 u2 = str2->str[0];
5815 for (i = 0; i < u->length; i++)
5816 if (u->str[i] == u1) {
5817 if (--maxcount < 0)
5818 break;
5819 u->str[i] = u2;
5820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005822 i = fastsearch(
5823 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 if (i < 0)
5826 goto nothing;
5827 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5828 if (!u)
5829 return NULL;
5830 Py_UNICODE_COPY(u->str, self->str, self->length);
5831 while (i <= self->length - str1->length)
5832 if (Py_UNICODE_MATCH(self, i, str1)) {
5833 if (--maxcount < 0)
5834 break;
5835 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5836 i += str1->length;
5837 } else
5838 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005841
5842 Py_ssize_t n, i, j, e;
5843 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_UNICODE *p;
5845
5846 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005847 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 if (n > maxcount)
5849 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005850 if (n == 0)
5851 goto nothing;
5852 /* new_size = self->length + n * (str2->length - str1->length)); */
5853 delta = (str2->length - str1->length);
5854 if (delta == 0) {
5855 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005857 product = n * (str2->length - str1->length);
5858 if ((product / (str2->length - str1->length)) != n) {
5859 PyErr_SetString(PyExc_OverflowError,
5860 "replace string is too long");
5861 return NULL;
5862 }
5863 new_size = self->length + product;
5864 if (new_size < 0) {
5865 PyErr_SetString(PyExc_OverflowError,
5866 "replace string is too long");
5867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 }
5869 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870 u = _PyUnicode_New(new_size);
5871 if (!u)
5872 return NULL;
5873 i = 0;
5874 p = u->str;
5875 e = self->length - str1->length;
5876 if (str1->length > 0) {
5877 while (n-- > 0) {
5878 /* look for next match */
5879 j = i;
5880 while (j <= e) {
5881 if (Py_UNICODE_MATCH(self, j, str1))
5882 break;
5883 j++;
5884 }
5885 if (j > i) {
5886 if (j > e)
5887 break;
5888 /* copy unchanged part [i:j] */
5889 Py_UNICODE_COPY(p, self->str+i, j-i);
5890 p += j - i;
5891 }
5892 /* copy substitution string */
5893 if (str2->length > 0) {
5894 Py_UNICODE_COPY(p, str2->str, str2->length);
5895 p += str2->length;
5896 }
5897 i = j + str1->length;
5898 }
5899 if (i < self->length)
5900 /* copy tail [i:] */
5901 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5902 } else {
5903 /* interleave */
5904 while (n > 0) {
5905 Py_UNICODE_COPY(p, str2->str, str2->length);
5906 p += str2->length;
5907 if (--n <= 0)
5908 break;
5909 *p++ = self->str[i++];
5910 }
5911 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5912 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005915
5916nothing:
5917 /* nothing to replace; return original string (when possible) */
5918 if (PyUnicode_CheckExact(self)) {
5919 Py_INCREF(self);
5920 return (PyObject *) self;
5921 }
5922 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923}
5924
5925/* --- Unicode Object Methods --------------------------------------------- */
5926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005927PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928"S.title() -> unicode\n\
5929\n\
5930Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005931characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
5933static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005934unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 return fixup(self, fixtitle);
5937}
5938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005939PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940"S.capitalize() -> unicode\n\
5941\n\
5942Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005943have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944
5945static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005946unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 return fixup(self, fixcapitalize);
5949}
5950
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split S on whitespace, capitalize every word in place in the
   temporary list, then join the words again with PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;      /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5988
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005989/* Argument converter. Coerces to a single unicode character */
5990
5991static int
5992convert_uc(PyObject *obj, void *addr)
5993{
5994 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5995 PyObject *uniobj;
5996 Py_UNICODE *unistr;
5997
5998 uniobj = PyUnicode_FromObject(obj);
5999 if (uniobj == NULL) {
6000 PyErr_SetString(PyExc_TypeError,
6001 "The fill character cannot be converted to Unicode");
6002 return 0;
6003 }
6004 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6005 PyErr_SetString(PyExc_TypeError,
6006 "The fill character must be exactly one character long");
6007 Py_DECREF(uniobj);
6008 return 0;
6009 }
6010 unistr = PyUnicode_AS_UNICODE(uniobj);
6011 *fillcharloc = unistr[0];
6012 Py_DECREF(uniobj);
6013 return 1;
6014}
6015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006016PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006017"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006019Return S centered in a Unicode string of length width. Padding is\n\
6020done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
6022static PyObject *
6023unicode_center(PyUnicodeObject *self, PyObject *args)
6024{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006025 Py_ssize_t marg, left;
6026 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006027 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
Thomas Woutersde017742006-02-16 19:34:37 +00006029 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 return NULL;
6031
Tim Peters7a29bd52001-09-12 03:03:31 +00006032 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 Py_INCREF(self);
6034 return (PyObject*) self;
6035 }
6036
6037 marg = width - self->length;
6038 left = marg / 2 + (marg & width & 1);
6039
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006040 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
Marc-André Lemburge5034372000-08-08 08:04:29 +00006043#if 0
6044
6045/* This code should go into some future Unicode collation support
6046 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006047 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006048
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006049/* speedy UTF-16 code point order comparison */
6050/* gleaned from: */
6051/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6052
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006053static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006054{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006055 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006056 0, 0, 0, 0, 0, 0, 0, 0,
6057 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006058 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006059};
6060
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061static int
6062unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6063{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006064 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006065
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 Py_UNICODE *s1 = str1->str;
6067 Py_UNICODE *s2 = str2->str;
6068
6069 len1 = str1->length;
6070 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006073 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006074
6075 c1 = *s1++;
6076 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006077
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006078 if (c1 > (1<<11) * 26)
6079 c1 += utf16Fixup[c1>>11];
6080 if (c2 > (1<<11) * 26)
6081 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006082 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006083
6084 if (c1 != c2)
6085 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006086
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006087 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 }
6089
6090 return (len1 < len2) ? -1 : (len1 != len2);
6091}
6092
Marc-André Lemburge5034372000-08-08 08:04:29 +00006093#else
6094
6095static int
6096unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6097{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006098 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006099
6100 Py_UNICODE *s1 = str1->str;
6101 Py_UNICODE *s2 = str2->str;
6102
6103 len1 = str1->length;
6104 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006105
Marc-André Lemburge5034372000-08-08 08:04:29 +00006106 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006107 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006108
Fredrik Lundh45714e92001-06-26 16:39:36 +00006109 c1 = *s1++;
6110 c2 = *s2++;
6111
6112 if (c1 != c2)
6113 return (c1 < c2) ? -1 : 1;
6114
Marc-André Lemburge5034372000-08-08 08:04:29 +00006115 len1--; len2--;
6116 }
6117
6118 return (len1 < len2) ? -1 : (len1 != len2);
6119}
6120
6121#endif
6122
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123int PyUnicode_Compare(PyObject *left,
6124 PyObject *right)
6125{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006126 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6127 return unicode_compare((PyUnicodeObject *)left,
6128 (PyUnicodeObject *)right);
6129 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6130 (PyUnicode_Check(left) && PyString_Check(right))) {
6131 if (PyUnicode_Check(left))
6132 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6133 if (PyUnicode_Check(right))
6134 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6135 assert(PyString_Check(left));
6136 assert(PyString_Check(right));
6137 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006139 PyErr_Format(PyExc_TypeError,
6140 "Can't compare %.100s and %.100s",
6141 left->ob_type->tp_name,
6142 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 return -1;
6144}
6145
Martin v. Löwis5b222132007-06-10 09:51:05 +00006146int
6147PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6148{
6149 int i;
6150 Py_UNICODE *id;
6151 assert(PyUnicode_Check(uni));
6152 id = PyUnicode_AS_UNICODE(uni);
6153 /* Compare Unicode string and source character set string */
6154 for (i = 0; id[i] && str[i]; i++)
6155 if (id[i] != str[i])
6156 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6157 if (id[i])
6158 return 1; /* uni is longer */
6159 if (str[i])
6160 return -1; /* str is longer */
6161 return 0;
6162}
6163
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006164PyObject *PyUnicode_RichCompare(PyObject *left,
6165 PyObject *right,
6166 int op)
6167{
6168 int result;
6169
6170 result = PyUnicode_Compare(left, right);
6171 if (result == -1 && PyErr_Occurred())
6172 goto onError;
6173
6174 /* Convert the return value to a Boolean */
6175 switch (op) {
6176 case Py_EQ:
6177 result = (result == 0);
6178 break;
6179 case Py_NE:
6180 result = (result != 0);
6181 break;
6182 case Py_LE:
6183 result = (result <= 0);
6184 break;
6185 case Py_GE:
6186 result = (result >= 0);
6187 break;
6188 case Py_LT:
6189 result = (result == -1);
6190 break;
6191 case Py_GT:
6192 result = (result == 1);
6193 break;
6194 }
6195 return PyBool_FromLong(result);
6196
6197 onError:
6198
6199 /* Standard case
6200
6201 Type errors mean that PyUnicode_FromObject() could not convert
6202 one of the arguments (usually the right hand side) to Unicode,
6203 ie. we can't handle the comparison request. However, it is
6204 possible that the other object knows a comparison method, which
6205 is why we return Py_NotImplemented to give the other object a
6206 chance.
6207
6208 */
6209 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6210 PyErr_Clear();
6211 Py_INCREF(Py_NotImplemented);
6212 return Py_NotImplemented;
6213 }
6214 if (op != Py_EQ && op != Py_NE)
6215 return NULL;
6216
6217 /* Equality comparison.
6218
6219 This is a special case: we silence any PyExc_UnicodeDecodeError
6220 and instead turn it into a PyErr_UnicodeWarning.
6221
6222 */
6223 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6224 return NULL;
6225 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006226 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6227 (op == Py_EQ) ?
6228 "Unicode equal comparison "
6229 "failed to convert both arguments to Unicode - "
6230 "interpreting them as being unequal"
6231 :
6232 "Unicode unequal comparison "
6233 "failed to convert both arguments to Unicode - "
6234 "interpreting them as being unequal",
6235 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006236 return NULL;
6237 result = (op == Py_NE);
6238 return PyBool_FromLong(result);
6239}
6240
Guido van Rossum403d68b2000-03-13 15:55:09 +00006241int PyUnicode_Contains(PyObject *container,
6242 PyObject *element)
6243{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006244 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006245 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006246
6247 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006248 sub = PyUnicode_FromObject(element);
6249 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006250 PyErr_Format(PyExc_TypeError,
6251 "'in <string>' requires string as left operand, not %s",
6252 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006253 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006254 }
6255
Thomas Wouters477c8d52006-05-27 19:21:47 +00006256 str = PyUnicode_FromObject(container);
6257 if (!str) {
6258 Py_DECREF(sub);
6259 return -1;
6260 }
6261
6262 result = stringlib_contains_obj(str, sub);
6263
6264 Py_DECREF(str);
6265 Py_DECREF(sub);
6266
Guido van Rossum403d68b2000-03-13 15:55:09 +00006267 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006268}
6269
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270/* Concat to string or Unicode object giving a new Unicode object. */
6271
6272PyObject *PyUnicode_Concat(PyObject *left,
6273 PyObject *right)
6274{
6275 PyUnicodeObject *u = NULL, *v = NULL, *w;
6276
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006277 if (PyBytes_Check(left) || PyBytes_Check(right))
6278 return PyBytes_Concat(left, right);
6279
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 /* Coerce the two arguments */
6281 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6282 if (u == NULL)
6283 goto onError;
6284 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6285 if (v == NULL)
6286 goto onError;
6287
6288 /* Shortcuts */
6289 if (v == unicode_empty) {
6290 Py_DECREF(v);
6291 return (PyObject *)u;
6292 }
6293 if (u == unicode_empty) {
6294 Py_DECREF(u);
6295 return (PyObject *)v;
6296 }
6297
6298 /* Concat the two Unicode strings */
6299 w = _PyUnicode_New(u->length + v->length);
6300 if (w == NULL)
6301 goto onError;
6302 Py_UNICODE_COPY(w->str, u->str, u->length);
6303 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6304
6305 Py_DECREF(u);
6306 Py_DECREF(v);
6307 return (PyObject *)w;
6308
6309onError:
6310 Py_XDECREF(u);
6311 Py_XDECREF(v);
6312 return NULL;
6313}
6314
Walter Dörwald1ab83302007-05-18 17:15:44 +00006315void
6316PyUnicode_Append(PyObject **pleft, PyObject *right)
6317{
6318 PyObject *new;
6319 if (*pleft == NULL)
6320 return;
6321 if (right == NULL || !PyUnicode_Check(*pleft)) {
6322 Py_DECREF(*pleft);
6323 *pleft = NULL;
6324 return;
6325 }
6326 new = PyUnicode_Concat(*pleft, right);
6327 Py_DECREF(*pleft);
6328 *pleft = new;
6329}
6330
6331void
6332PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6333{
6334 PyUnicode_Append(pleft, right);
6335 Py_XDECREF(right);
6336}
6337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006338PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339"S.count(sub[, start[, end]]) -> int\n\
6340\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006341Return the number of non-overlapping occurrences of substring sub in\n\
6342Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006343interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344
6345static PyObject *
6346unicode_count(PyUnicodeObject *self, PyObject *args)
6347{
6348 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006349 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006350 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 PyObject *result;
6352
Guido van Rossumb8872e62000-05-09 14:14:27 +00006353 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6354 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355 return NULL;
6356
6357 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 if (substring == NULL)
6360 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006361
Thomas Wouters477c8d52006-05-27 19:21:47 +00006362 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
Thomas Wouters477c8d52006-05-27 19:21:47 +00006364 result = PyInt_FromSsize_t(
6365 stringlib_count(self->str + start, end - start,
6366 substring->str, substring->length)
6367 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368
6369 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006370
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 return result;
6372}
6373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006374PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006375"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006377Encodes S using the codec registered for encoding. encoding defaults\n\
6378to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006379handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006380a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6381'xmlcharrefreplace' as well as any other name registered with\n\
6382codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
6384static PyObject *
6385unicode_encode(PyUnicodeObject *self, PyObject *args)
6386{
6387 char *encoding = NULL;
6388 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006389 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6392 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006393 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006394 if (v == NULL)
6395 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006396 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006397 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006398 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006399 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006400 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006401 Py_DECREF(v);
6402 return NULL;
6403 }
6404 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006405
6406 onError:
6407 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006408}
6409
6410PyDoc_STRVAR(decode__doc__,
6411"S.decode([encoding[,errors]]) -> string or unicode\n\
6412\n\
6413Decodes S using the codec registered for encoding. encoding defaults\n\
6414to the default encoding. errors may be given to set a different error\n\
6415handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6416a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6417as well as any other name registerd with codecs.register_error that is\n\
6418able to handle UnicodeDecodeErrors.");
6419
6420static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006421unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006422{
Guido van Rossuma74184e2007-08-29 04:05:57 +00006423 PyErr_Format(PyExc_TypeError, "decoding str is not supported");
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425}
6426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428"S.expandtabs([tabsize]) -> unicode\n\
6429\n\
6430Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006431If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
6433static PyObject*
6434unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6435{
6436 Py_UNICODE *e;
6437 Py_UNICODE *p;
6438 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006439 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 PyUnicodeObject *u;
6441 int tabsize = 8;
6442
6443 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6444 return NULL;
6445
Thomas Wouters7e474022000-07-16 12:04:32 +00006446 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006447 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 e = self->str + self->length;
6449 for (p = self->str; p < e; p++)
6450 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006451 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006453 if (old_j > j) {
6454 PyErr_SetString(PyExc_OverflowError,
6455 "new string is too long");
6456 return NULL;
6457 }
6458 old_j = j;
6459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
6461 else {
6462 j++;
6463 if (*p == '\n' || *p == '\r') {
6464 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006465 old_j = j = 0;
6466 if (i < 0) {
6467 PyErr_SetString(PyExc_OverflowError,
6468 "new string is too long");
6469 return NULL;
6470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 }
6472 }
6473
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006474 if ((i + j) < 0) {
6475 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6476 return NULL;
6477 }
6478
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 /* Second pass: create output string and fill it */
6480 u = _PyUnicode_New(i + j);
6481 if (!u)
6482 return NULL;
6483
6484 j = 0;
6485 q = u->str;
6486
6487 for (p = self->str; p < e; p++)
6488 if (*p == '\t') {
6489 if (tabsize > 0) {
6490 i = tabsize - (j % tabsize);
6491 j += i;
6492 while (i--)
6493 *q++ = ' ';
6494 }
6495 }
6496 else {
6497 j++;
6498 *q++ = *p;
6499 if (*p == '\n' || *p == '\r')
6500 j = 0;
6501 }
6502
6503 return (PyObject*) u;
6504}
6505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006506PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507"S.find(sub [,start [,end]]) -> int\n\
6508\n\
6509Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006510such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511arguments start and end are interpreted as in slice notation.\n\
6512\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006513Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514
6515static PyObject *
6516unicode_find(PyUnicodeObject *self, PyObject *args)
6517{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006518 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006519 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006520 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006521 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522
Guido van Rossumb8872e62000-05-09 14:14:27 +00006523 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6524 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006526 substring = PyUnicode_FromObject(substring);
6527 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 return NULL;
6529
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 result = stringlib_find_slice(
6531 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6532 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6533 start, end
6534 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535
6536 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006537
6538 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
6541static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006542unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543{
6544 if (index < 0 || index >= self->length) {
6545 PyErr_SetString(PyExc_IndexError, "string index out of range");
6546 return NULL;
6547 }
6548
6549 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6550}
6551
6552static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006553unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006555 /* Since Unicode objects compare equal to their UTF-8 string
6556 counterparts, we hash the UTF-8 string. */
6557 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6558 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559}
6560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006561PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562"S.index(sub [,start [,end]]) -> int\n\
6563\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006564Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565
6566static PyObject *
6567unicode_index(PyUnicodeObject *self, PyObject *args)
6568{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006570 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006571 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006572 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573
Guido van Rossumb8872e62000-05-09 14:14:27 +00006574 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6575 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006577 substring = PyUnicode_FromObject(substring);
6578 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 return NULL;
6580
Thomas Wouters477c8d52006-05-27 19:21:47 +00006581 result = stringlib_find_slice(
6582 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6583 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6584 start, end
6585 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
6587 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006588
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 if (result < 0) {
6590 PyErr_SetString(PyExc_ValueError, "substring not found");
6591 return NULL;
6592 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006593
Martin v. Löwis18e16552006-02-15 17:27:45 +00006594 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595}
6596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006597PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006598"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006600Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006601at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
6603static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006604unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605{
6606 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6607 register const Py_UNICODE *e;
6608 int cased;
6609
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 /* Shortcut for single character strings */
6611 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006612 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006614 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006615 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006616 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006617
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 e = p + PyUnicode_GET_SIZE(self);
6619 cased = 0;
6620 for (; p < e; p++) {
6621 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006622
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006624 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 else if (!cased && Py_UNICODE_ISLOWER(ch))
6626 cased = 1;
6627 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006628 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629}
6630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006631PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006632"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006634Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006635at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
6637static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006638unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
6640 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6641 register const Py_UNICODE *e;
6642 int cased;
6643
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 /* Shortcut for single character strings */
6645 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006646 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006648 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006649 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006650 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006651
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 e = p + PyUnicode_GET_SIZE(self);
6653 cased = 0;
6654 for (; p < e; p++) {
6655 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006656
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006658 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 else if (!cased && Py_UNICODE_ISUPPER(ch))
6660 cased = 1;
6661 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006662 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663}
6664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006665PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006666"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006668Return True if S is a titlecased string and there is at least one\n\
6669character in S, i.e. upper- and titlecase characters may only\n\
6670follow uncased characters and lowercase characters only cased ones.\n\
6671Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006674unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675{
6676 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6677 register const Py_UNICODE *e;
6678 int cased, previous_is_cased;
6679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 /* Shortcut for single character strings */
6681 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006682 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6683 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006685 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006686 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006687 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 e = p + PyUnicode_GET_SIZE(self);
6690 cased = 0;
6691 previous_is_cased = 0;
6692 for (; p < e; p++) {
6693 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6696 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006697 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 previous_is_cased = 1;
6699 cased = 1;
6700 }
6701 else if (Py_UNICODE_ISLOWER(ch)) {
6702 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006703 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 previous_is_cased = 1;
6705 cased = 1;
6706 }
6707 else
6708 previous_is_cased = 0;
6709 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006710 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711}
6712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006713PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006714"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006716Return True if all characters in S are whitespace\n\
6717and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718
6719static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006720unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721{
6722 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6723 register const Py_UNICODE *e;
6724
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 /* Shortcut for single character strings */
6726 if (PyUnicode_GET_SIZE(self) == 1 &&
6727 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006728 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006730 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006731 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006732 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 e = p + PyUnicode_GET_SIZE(self);
6735 for (; p < e; p++) {
6736 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006737 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006739 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740}
6741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006742PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006744\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006745Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006746and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006747
6748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006749unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006750{
6751 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6752 register const Py_UNICODE *e;
6753
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006754 /* Shortcut for single character strings */
6755 if (PyUnicode_GET_SIZE(self) == 1 &&
6756 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006757 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006758
6759 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006760 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006761 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006762
6763 e = p + PyUnicode_GET_SIZE(self);
6764 for (; p < e; p++) {
6765 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006766 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006767 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006768 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006769}
6770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006771PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006773\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006774Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006775and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006776
6777static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006778unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006779{
6780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6781 register const Py_UNICODE *e;
6782
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006783 /* Shortcut for single character strings */
6784 if (PyUnicode_GET_SIZE(self) == 1 &&
6785 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006786 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006787
6788 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006789 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006790 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006791
6792 e = p + PyUnicode_GET_SIZE(self);
6793 for (; p < e; p++) {
6794 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006797 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006798}
6799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006800PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006801"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006803Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
6806static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006807unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808{
6809 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6810 register const Py_UNICODE *e;
6811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 /* Shortcut for single character strings */
6813 if (PyUnicode_GET_SIZE(self) == 1 &&
6814 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006815 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006817 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006818 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006819 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006820
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 e = p + PyUnicode_GET_SIZE(self);
6822 for (; p < e; p++) {
6823 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006824 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827}
6828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006829PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006830"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006832Return True if all characters in S are digits\n\
6833and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834
6835static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006836unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837{
6838 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6839 register const Py_UNICODE *e;
6840
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 /* Shortcut for single character strings */
6842 if (PyUnicode_GET_SIZE(self) == 1 &&
6843 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006844 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006846 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006847 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006849
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 e = p + PyUnicode_GET_SIZE(self);
6851 for (; p < e; p++) {
6852 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006853 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856}
6857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006859"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006861Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006862False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863
6864static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006865unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866{
6867 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6868 register const Py_UNICODE *e;
6869
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 /* Shortcut for single character strings */
6871 if (PyUnicode_GET_SIZE(self) == 1 &&
6872 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006873 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006875 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006876 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 e = p + PyUnicode_GET_SIZE(self);
6880 for (; p < e; p++) {
6881 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006882 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885}
6886
Martin v. Löwis47383402007-08-15 07:32:56 +00006887int
6888PyUnicode_IsIdentifier(PyObject *self)
6889{
6890 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6891 register const Py_UNICODE *e;
6892
6893 /* Special case for empty strings */
6894 if (PyUnicode_GET_SIZE(self) == 0)
6895 return 0;
6896
6897 /* PEP 3131 says that the first character must be in
6898 XID_Start and subsequent characters in XID_Continue,
6899 and for the ASCII range, the 2.x rules apply (i.e
6900 start with letters and underscore, continue with
6901 letters, digits, underscore). However, given the current
6902 definition of XID_Start and XID_Continue, it is sufficient
6903 to check just for these, except that _ must be allowed
6904 as starting an identifier. */
6905 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6906 return 0;
6907
6908 e = p + PyUnicode_GET_SIZE(self);
6909 for (p++; p < e; p++) {
6910 if (!_PyUnicode_IsXidContinue(*p))
6911 return 0;
6912 }
6913 return 1;
6914}
6915
6916PyDoc_STRVAR(isidentifier__doc__,
6917"S.isidentifier() -> bool\n\
6918\n\
6919Return True if S is a valid identifier according\n\
6920to the language definition.");
6921
6922static PyObject*
6923unicode_isidentifier(PyObject *self)
6924{
6925 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6926}
6927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006928PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929"S.join(sequence) -> unicode\n\
6930\n\
6931Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006932sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
6934static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006935unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006937 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938}
6939
Martin v. Löwis18e16552006-02-15 17:27:45 +00006940static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941unicode_length(PyUnicodeObject *self)
6942{
6943 return self->length;
6944}
6945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006946PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006947"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948\n\
6949Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006950done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
6952static PyObject *
6953unicode_ljust(PyUnicodeObject *self, PyObject *args)
6954{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006955 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006956 Py_UNICODE fillchar = ' ';
6957
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006958 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 return NULL;
6960
Tim Peters7a29bd52001-09-12 03:03:31 +00006961 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 Py_INCREF(self);
6963 return (PyObject*) self;
6964 }
6965
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006966 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967}
6968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006969PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970"S.lower() -> unicode\n\
6971\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006972Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973
6974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006975unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 return fixup(self, fixlower);
6978}
6979
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selector values above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
6988
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006989/* externally visible for str.strip(unicode) */
6990PyObject *
6991_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6992{
6993 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006994 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006995 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006996 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6997 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006998
Thomas Wouters477c8d52006-05-27 19:21:47 +00006999 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7000
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007001 i = 0;
7002 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007003 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7004 i++;
7005 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007006 }
7007
7008 j = len;
7009 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007010 do {
7011 j--;
7012 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7013 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007014 }
7015
7016 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007017 Py_INCREF(self);
7018 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007019 }
7020 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007021 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007022}
7023
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024
7025static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007026do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007028 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007029 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007030
7031 i = 0;
7032 if (striptype != RIGHTSTRIP) {
7033 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7034 i++;
7035 }
7036 }
7037
7038 j = len;
7039 if (striptype != LEFTSTRIP) {
7040 do {
7041 j--;
7042 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7043 j++;
7044 }
7045
7046 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7047 Py_INCREF(self);
7048 return (PyObject*)self;
7049 }
7050 else
7051 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054
7055static PyObject *
7056do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7057{
7058 PyObject *sep = NULL;
7059
7060 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7061 return NULL;
7062
7063 if (sep != NULL && sep != Py_None) {
7064 if (PyUnicode_Check(sep))
7065 return _PyUnicode_XStrip(self, striptype, sep);
7066 else if (PyString_Check(sep)) {
7067 PyObject *res;
7068 sep = PyUnicode_FromObject(sep);
7069 if (sep==NULL)
7070 return NULL;
7071 res = _PyUnicode_XStrip(self, striptype, sep);
7072 Py_DECREF(sep);
7073 return res;
7074 }
7075 else {
7076 PyErr_Format(PyExc_TypeError,
7077 "%s arg must be None, unicode or str",
7078 STRIPNAME(striptype));
7079 return NULL;
7080 }
7081 }
7082
7083 return do_strip(self, striptype);
7084}
7085
7086
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007088"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089\n\
7090Return a copy of the string S with leading and trailing\n\
7091whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007092If chars is given and not None, remove characters in chars instead.\n\
7093If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094
7095static PyObject *
7096unicode_strip(PyUnicodeObject *self, PyObject *args)
7097{
7098 if (PyTuple_GET_SIZE(args) == 0)
7099 return do_strip(self, BOTHSTRIP); /* Common case */
7100 else
7101 return do_argstrip(self, BOTHSTRIP, args);
7102}
7103
7104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007105PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007106"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007107\n\
7108Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007109If chars is given and not None, remove characters in chars instead.\n\
7110If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111
7112static PyObject *
7113unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7114{
7115 if (PyTuple_GET_SIZE(args) == 0)
7116 return do_strip(self, LEFTSTRIP); /* Common case */
7117 else
7118 return do_argstrip(self, LEFTSTRIP, args);
7119}
7120
7121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007122PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007123"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007124\n\
7125Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007126If chars is given and not None, remove characters in chars instead.\n\
7127If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007128
7129static PyObject *
7130unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7131{
7132 if (PyTuple_GET_SIZE(args) == 0)
7133 return do_strip(self, RIGHTSTRIP); /* Common case */
7134 else
7135 return do_argstrip(self, RIGHTSTRIP, args);
7136}
7137
7138
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007140unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141{
7142 PyUnicodeObject *u;
7143 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007144 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007145 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
7147 if (len < 0)
7148 len = 0;
7149
Tim Peters7a29bd52001-09-12 03:03:31 +00007150 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151 /* no repeat, return original string */
7152 Py_INCREF(str);
7153 return (PyObject*) str;
7154 }
Tim Peters8f422462000-09-09 06:13:41 +00007155
7156 /* ensure # of chars needed doesn't overflow int and # of bytes
7157 * needed doesn't overflow size_t
7158 */
7159 nchars = len * str->length;
7160 if (len && nchars / len != str->length) {
7161 PyErr_SetString(PyExc_OverflowError,
7162 "repeated string is too long");
7163 return NULL;
7164 }
7165 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7166 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7167 PyErr_SetString(PyExc_OverflowError,
7168 "repeated string is too long");
7169 return NULL;
7170 }
7171 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 if (!u)
7173 return NULL;
7174
7175 p = u->str;
7176
Thomas Wouters477c8d52006-05-27 19:21:47 +00007177 if (str->length == 1 && len > 0) {
7178 Py_UNICODE_FILL(p, str->str[0], len);
7179 } else {
7180 Py_ssize_t done = 0; /* number of characters copied this far */
7181 if (done < nchars) {
7182 Py_UNICODE_COPY(p, str->str, str->length);
7183 done = str->length;
7184 }
7185 while (done < nchars) {
7186 int n = (done <= nchars-done) ? done : nchars-done;
7187 Py_UNICODE_COPY(p+done, p, n);
7188 done += n;
7189 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 }
7191
7192 return (PyObject*) u;
7193}
7194
7195PyObject *PyUnicode_Replace(PyObject *obj,
7196 PyObject *subobj,
7197 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007198 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199{
7200 PyObject *self;
7201 PyObject *str1;
7202 PyObject *str2;
7203 PyObject *result;
7204
7205 self = PyUnicode_FromObject(obj);
7206 if (self == NULL)
7207 return NULL;
7208 str1 = PyUnicode_FromObject(subobj);
7209 if (str1 == NULL) {
7210 Py_DECREF(self);
7211 return NULL;
7212 }
7213 str2 = PyUnicode_FromObject(replobj);
7214 if (str2 == NULL) {
7215 Py_DECREF(self);
7216 Py_DECREF(str1);
7217 return NULL;
7218 }
Tim Petersced69f82003-09-16 20:30:58 +00007219 result = replace((PyUnicodeObject *)self,
7220 (PyUnicodeObject *)str1,
7221 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 maxcount);
7223 Py_DECREF(self);
7224 Py_DECREF(str1);
7225 Py_DECREF(str2);
7226 return result;
7227}
7228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007229PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230"S.replace (old, new[, maxsplit]) -> unicode\n\
7231\n\
7232Return a copy of S with all occurrences of substring\n\
7233old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007234given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235
7236static PyObject*
7237unicode_replace(PyUnicodeObject *self, PyObject *args)
7238{
7239 PyUnicodeObject *str1;
7240 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007241 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 PyObject *result;
7243
Martin v. Löwis18e16552006-02-15 17:27:45 +00007244 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 return NULL;
7246 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7247 if (str1 == NULL)
7248 return NULL;
7249 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007250 if (str2 == NULL) {
7251 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
7255 result = replace(self, str1, str2, maxcount);
7256
7257 Py_DECREF(str1);
7258 Py_DECREF(str2);
7259 return result;
7260}
7261
7262static
7263PyObject *unicode_repr(PyObject *unicode)
7264{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007265 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007266 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007267 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7268 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7269
7270 /* XXX(nnorwitz): rather than over-allocating, it would be
7271 better to choose a different scheme. Perhaps scan the
7272 first N-chars of the string and allocate based on that size.
7273 */
7274 /* Initial allocation is based on the longest-possible unichr
7275 escape.
7276
7277 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7278 unichr, so in this case it's the longest unichr escape. In
7279 narrow (UTF-16) builds this is five chars per source unichr
7280 since there are two unichrs in the surrogate pair, so in narrow
7281 (UTF-16) builds it's not the longest unichr escape.
7282
7283 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7284 so in the narrow (UTF-16) build case it's the longest unichr
7285 escape.
7286 */
7287
Walter Dörwald1ab83302007-05-18 17:15:44 +00007288 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007289 2 /* quotes */
7290#ifdef Py_UNICODE_WIDE
7291 + 10*size
7292#else
7293 + 6*size
7294#endif
7295 + 1);
7296 if (repr == NULL)
7297 return NULL;
7298
Walter Dörwald1ab83302007-05-18 17:15:44 +00007299 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007300
7301 /* Add quote */
7302 *p++ = (findchar(s, size, '\'') &&
7303 !findchar(s, size, '"')) ? '"' : '\'';
7304 while (size-- > 0) {
7305 Py_UNICODE ch = *s++;
7306
7307 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007308 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007309 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007310 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007311 continue;
7312 }
7313
7314#ifdef Py_UNICODE_WIDE
7315 /* Map 21-bit characters to '\U00xxxxxx' */
7316 else if (ch >= 0x10000) {
7317 *p++ = '\\';
7318 *p++ = 'U';
7319 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7320 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7321 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7322 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7323 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7324 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7325 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7326 *p++ = hexdigits[ch & 0x0000000F];
7327 continue;
7328 }
7329#else
7330 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7331 else if (ch >= 0xD800 && ch < 0xDC00) {
7332 Py_UNICODE ch2;
7333 Py_UCS4 ucs;
7334
7335 ch2 = *s++;
7336 size--;
7337 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7338 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7339 *p++ = '\\';
7340 *p++ = 'U';
7341 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7342 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7343 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7344 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7345 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7346 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7347 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7348 *p++ = hexdigits[ucs & 0x0000000F];
7349 continue;
7350 }
7351 /* Fall through: isolated surrogates are copied as-is */
7352 s--;
7353 size++;
7354 }
7355#endif
7356
7357 /* Map 16-bit characters to '\uxxxx' */
7358 if (ch >= 256) {
7359 *p++ = '\\';
7360 *p++ = 'u';
7361 *p++ = hexdigits[(ch >> 12) & 0x000F];
7362 *p++ = hexdigits[(ch >> 8) & 0x000F];
7363 *p++ = hexdigits[(ch >> 4) & 0x000F];
7364 *p++ = hexdigits[ch & 0x000F];
7365 }
7366
7367 /* Map special whitespace to '\t', \n', '\r' */
7368 else if (ch == '\t') {
7369 *p++ = '\\';
7370 *p++ = 't';
7371 }
7372 else if (ch == '\n') {
7373 *p++ = '\\';
7374 *p++ = 'n';
7375 }
7376 else if (ch == '\r') {
7377 *p++ = '\\';
7378 *p++ = 'r';
7379 }
7380
7381 /* Map non-printable US ASCII to '\xhh' */
7382 else if (ch < ' ' || ch >= 0x7F) {
7383 *p++ = '\\';
7384 *p++ = 'x';
7385 *p++ = hexdigits[(ch >> 4) & 0x000F];
7386 *p++ = hexdigits[ch & 0x000F];
7387 }
7388
7389 /* Copy everything else as-is */
7390 else
7391 *p++ = (char) ch;
7392 }
7393 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007394 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007395
7396 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007397 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007398 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399}
7400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007401PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402"S.rfind(sub [,start [,end]]) -> int\n\
7403\n\
7404Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007405such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406arguments start and end are interpreted as in slice notation.\n\
7407\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007408Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
7410static PyObject *
7411unicode_rfind(PyUnicodeObject *self, PyObject *args)
7412{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007413 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007414 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007415 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007416 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
Guido van Rossumb8872e62000-05-09 14:14:27 +00007418 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7419 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007421 substring = PyUnicode_FromObject(substring);
7422 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423 return NULL;
7424
Thomas Wouters477c8d52006-05-27 19:21:47 +00007425 result = stringlib_rfind_slice(
7426 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7427 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7428 start, end
7429 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430
7431 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007432
7433 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434}
7435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007436PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437"S.rindex(sub [,start [,end]]) -> int\n\
7438\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007439Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
7441static PyObject *
7442unicode_rindex(PyUnicodeObject *self, PyObject *args)
7443{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007444 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007445 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007446 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007447 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
Guido van Rossumb8872e62000-05-09 14:14:27 +00007449 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7450 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007452 substring = PyUnicode_FromObject(substring);
7453 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 return NULL;
7455
Thomas Wouters477c8d52006-05-27 19:21:47 +00007456 result = stringlib_rfind_slice(
7457 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7458 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7459 start, end
7460 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461
7462 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007463
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 if (result < 0) {
7465 PyErr_SetString(PyExc_ValueError, "substring not found");
7466 return NULL;
7467 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007468 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469}
7470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007471PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007472"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473\n\
7474Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007475done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
7477static PyObject *
7478unicode_rjust(PyUnicodeObject *self, PyObject *args)
7479{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007480 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007481 Py_UNICODE fillchar = ' ';
7482
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007483 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 return NULL;
7485
Tim Peters7a29bd52001-09-12 03:03:31 +00007486 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 Py_INCREF(self);
7488 return (PyObject*) self;
7489 }
7490
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007491 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492}
7493
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007495unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496{
7497 /* standard clamping */
7498 if (start < 0)
7499 start = 0;
7500 if (end < 0)
7501 end = 0;
7502 if (end > self->length)
7503 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007504 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 /* full slice, return original string */
7506 Py_INCREF(self);
7507 return (PyObject*) self;
7508 }
7509 if (start > end)
7510 start = end;
7511 /* copy slice */
7512 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7513 end - start);
7514}
7515
7516PyObject *PyUnicode_Split(PyObject *s,
7517 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007518 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519{
7520 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007521
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 s = PyUnicode_FromObject(s);
7523 if (s == NULL)
7524 return NULL;
7525 if (sep != NULL) {
7526 sep = PyUnicode_FromObject(sep);
7527 if (sep == NULL) {
7528 Py_DECREF(s);
7529 return NULL;
7530 }
7531 }
7532
7533 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7534
7535 Py_DECREF(s);
7536 Py_XDECREF(sep);
7537 return result;
7538}
7539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007540PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541"S.split([sep [,maxsplit]]) -> list of strings\n\
7542\n\
7543Return a list of the words in S, using sep as the\n\
7544delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007545splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007546any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547
7548static PyObject*
7549unicode_split(PyUnicodeObject *self, PyObject *args)
7550{
7551 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007552 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553
Martin v. Löwis18e16552006-02-15 17:27:45 +00007554 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 return NULL;
7556
7557 if (substring == Py_None)
7558 return split(self, NULL, maxcount);
7559 else if (PyUnicode_Check(substring))
7560 return split(self, (PyUnicodeObject *)substring, maxcount);
7561 else
7562 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7563}
7564
Thomas Wouters477c8d52006-05-27 19:21:47 +00007565PyObject *
7566PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7567{
7568 PyObject* str_obj;
7569 PyObject* sep_obj;
7570 PyObject* out;
7571
7572 str_obj = PyUnicode_FromObject(str_in);
7573 if (!str_obj)
7574 return NULL;
7575 sep_obj = PyUnicode_FromObject(sep_in);
7576 if (!sep_obj) {
7577 Py_DECREF(str_obj);
7578 return NULL;
7579 }
7580
7581 out = stringlib_partition(
7582 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7583 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7584 );
7585
7586 Py_DECREF(sep_obj);
7587 Py_DECREF(str_obj);
7588
7589 return out;
7590}
7591
7592
7593PyObject *
7594PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7595{
7596 PyObject* str_obj;
7597 PyObject* sep_obj;
7598 PyObject* out;
7599
7600 str_obj = PyUnicode_FromObject(str_in);
7601 if (!str_obj)
7602 return NULL;
7603 sep_obj = PyUnicode_FromObject(sep_in);
7604 if (!sep_obj) {
7605 Py_DECREF(str_obj);
7606 return NULL;
7607 }
7608
7609 out = stringlib_rpartition(
7610 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7611 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7612 );
7613
7614 Py_DECREF(sep_obj);
7615 Py_DECREF(str_obj);
7616
7617 return out;
7618}
7619
7620PyDoc_STRVAR(partition__doc__,
7621"S.partition(sep) -> (head, sep, tail)\n\
7622\n\
7623Searches for the separator sep in S, and returns the part before it,\n\
7624the separator itself, and the part after it. If the separator is not\n\
7625found, returns S and two empty strings.");
7626
7627static PyObject*
7628unicode_partition(PyUnicodeObject *self, PyObject *separator)
7629{
7630 return PyUnicode_Partition((PyObject *)self, separator);
7631}
7632
7633PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007634"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007635\n\
7636Searches for the separator sep in S, starting at the end of S, and returns\n\
7637the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007638separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007639
7640static PyObject*
7641unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7642{
7643 return PyUnicode_RPartition((PyObject *)self, separator);
7644}
7645
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007646PyObject *PyUnicode_RSplit(PyObject *s,
7647 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007648 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007649{
7650 PyObject *result;
7651
7652 s = PyUnicode_FromObject(s);
7653 if (s == NULL)
7654 return NULL;
7655 if (sep != NULL) {
7656 sep = PyUnicode_FromObject(sep);
7657 if (sep == NULL) {
7658 Py_DECREF(s);
7659 return NULL;
7660 }
7661 }
7662
7663 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7664
7665 Py_DECREF(s);
7666 Py_XDECREF(sep);
7667 return result;
7668}
7669
7670PyDoc_STRVAR(rsplit__doc__,
7671"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7672\n\
7673Return a list of the words in S, using sep as the\n\
7674delimiter string, starting at the end of the string and\n\
7675working to the front. If maxsplit is given, at most maxsplit\n\
7676splits are done. If sep is not specified, any whitespace string\n\
7677is a separator.");
7678
7679static PyObject*
7680unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7681{
7682 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007683 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007684
Martin v. Löwis18e16552006-02-15 17:27:45 +00007685 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007686 return NULL;
7687
7688 if (substring == Py_None)
7689 return rsplit(self, NULL, maxcount);
7690 else if (PyUnicode_Check(substring))
7691 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7692 else
7693 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7694}
7695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007696PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007697"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698\n\
7699Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007700Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007701is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702
7703static PyObject*
7704unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7705{
Guido van Rossum86662912000-04-11 15:38:46 +00007706 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707
Guido van Rossum86662912000-04-11 15:38:46 +00007708 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 return NULL;
7710
Guido van Rossum86662912000-04-11 15:38:46 +00007711 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712}
7713
7714static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007715PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716{
Walter Dörwald346737f2007-05-31 10:44:43 +00007717 if (PyUnicode_CheckExact(self)) {
7718 Py_INCREF(self);
7719 return self;
7720 } else
7721 /* Subtype -- return genuine unicode string with the same value. */
7722 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7723 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724}
7725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007726PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727"S.swapcase() -> unicode\n\
7728\n\
7729Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007730and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
7732static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007733unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 return fixup(self, fixswapcase);
7736}
7737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007738PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739"S.translate(table) -> unicode\n\
7740\n\
7741Return a copy of the string S, where all characters have been mapped\n\
7742through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007743Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7744Unmapped characters are left untouched. Characters mapped to None\n\
7745are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
7747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007748unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749{
Tim Petersced69f82003-09-16 20:30:58 +00007750 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007752 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 "ignore");
7754}
7755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007756PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757"S.upper() -> unicode\n\
7758\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007759Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760
7761static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007762unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764 return fixup(self, fixupper);
7765}
7766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007767PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768"S.zfill(width) -> unicode\n\
7769\n\
7770Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007771of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772
7773static PyObject *
7774unicode_zfill(PyUnicodeObject *self, PyObject *args)
7775{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007776 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777 PyUnicodeObject *u;
7778
Martin v. Löwis18e16552006-02-15 17:27:45 +00007779 Py_ssize_t width;
7780 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 return NULL;
7782
7783 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007784 if (PyUnicode_CheckExact(self)) {
7785 Py_INCREF(self);
7786 return (PyObject*) self;
7787 }
7788 else
7789 return PyUnicode_FromUnicode(
7790 PyUnicode_AS_UNICODE(self),
7791 PyUnicode_GET_SIZE(self)
7792 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 }
7794
7795 fill = width - self->length;
7796
7797 u = pad(self, fill, 0, '0');
7798
Walter Dörwald068325e2002-04-15 13:36:47 +00007799 if (u == NULL)
7800 return NULL;
7801
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 if (u->str[fill] == '+' || u->str[fill] == '-') {
7803 /* move sign to beginning of string */
7804 u->str[0] = u->str[fill];
7805 u->str[fill] = '0';
7806 }
7807
7808 return (PyObject*) u;
7809}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as an
   int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long current_size = unicode_freelist_size;

    return PyInt_FromLong(current_size);
}
#endif
7818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007819PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007820"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007822Return True if S starts with the specified prefix, False otherwise.\n\
7823With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007824With optional end, stop comparing S at that position.\n\
7825prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826
7827static PyObject *
7828unicode_startswith(PyUnicodeObject *self,
7829 PyObject *args)
7830{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007831 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007834 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007835 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007837 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007838 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007840 if (PyTuple_Check(subobj)) {
7841 Py_ssize_t i;
7842 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7843 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7844 PyTuple_GET_ITEM(subobj, i));
7845 if (substring == NULL)
7846 return NULL;
7847 result = tailmatch(self, substring, start, end, -1);
7848 Py_DECREF(substring);
7849 if (result) {
7850 Py_RETURN_TRUE;
7851 }
7852 }
7853 /* nothing matched */
7854 Py_RETURN_FALSE;
7855 }
7856 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007858 return NULL;
7859 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007861 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862}
7863
7864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007865PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007866"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007868Return True if S ends with the specified suffix, False otherwise.\n\
7869With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007870With optional end, stop comparing S at that position.\n\
7871suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872
7873static PyObject *
7874unicode_endswith(PyUnicodeObject *self,
7875 PyObject *args)
7876{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007877 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007879 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007880 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007881 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007883 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7884 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007886 if (PyTuple_Check(subobj)) {
7887 Py_ssize_t i;
7888 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7889 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7890 PyTuple_GET_ITEM(subobj, i));
7891 if (substring == NULL)
7892 return NULL;
7893 result = tailmatch(self, substring, start, end, +1);
7894 Py_DECREF(substring);
7895 if (result) {
7896 Py_RETURN_TRUE;
7897 }
7898 }
7899 Py_RETURN_FALSE;
7900 }
7901 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007905 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007907 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908}
7909
Eric Smith8c663262007-08-25 02:26:07 +00007910#include "stringlib/string_format.h"
7911
7912PyDoc_STRVAR(format__doc__,
7913"S.format(*args, **kwargs) -> unicode\n\
7914\n\
7915");
7916
7917static PyObject *
7918unicode_format(PyObject *self, PyObject *args, PyObject *kwds)
7919{
7920 /* this calls into stringlib/string_format.h because it can be
7921 included for either string or unicode. this is needed for
7922 python 2.6. */
7923 return do_string_format(self, args, kwds);
7924}
7925
7926
7927PyDoc_STRVAR(p_format__doc__,
7928"S.__format__(format_spec) -> unicode\n\
7929\n\
7930");
7931
7932static PyObject *
7933unicode__format__(PyObject *self, PyObject *args)
7934{
7935 return unicode_unicode__format__(self, args);
7936}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007938
7939static PyObject *
7940unicode_getnewargs(PyUnicodeObject *v)
7941{
7942 return Py_BuildValue("(u#)", v->str, v->length);
7943}
7944
7945
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946static PyMethodDef unicode_methods[] = {
7947
7948 /* Order is according to common usage: often used methods should
7949 appear first, since lookup is done sequentially. */
7950
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007951 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7952 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7953 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007954 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007955 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7956 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7957 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7958 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7959 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7960 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7961 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007962 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007963 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7964 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7965 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007966 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007967 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007968/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7969 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7970 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7971 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007972 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007973 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007974 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007975 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007976 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7977 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7978 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7979 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7980 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7981 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7982 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7983 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7984 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7985 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7986 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7987 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7988 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7989 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00007990 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007991 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith8c663262007-08-25 02:26:07 +00007992 {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7993 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00007994 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7995 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00007996#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007997 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998#endif
7999
8000#if 0
8001 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008002 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003#endif
8004
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008005 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 {NULL, NULL}
8007};
8008
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008009static PyObject *
8010unicode_mod(PyObject *v, PyObject *w)
8011{
8012 if (!PyUnicode_Check(v)) {
8013 Py_INCREF(Py_NotImplemented);
8014 return Py_NotImplemented;
8015 }
8016 return PyUnicode_Format(v, w);
8017}
8018
8019static PyNumberMethods unicode_as_number = {
8020 0, /*nb_add*/
8021 0, /*nb_subtract*/
8022 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008023 unicode_mod, /*nb_remainder*/
8024};
8025
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008027 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008028 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008029 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8030 (ssizeargfunc) unicode_getitem, /* sq_item */
8031 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 0, /* sq_ass_item */
8033 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008034 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035};
8036
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008037static PyObject*
8038unicode_subscript(PyUnicodeObject* self, PyObject* item)
8039{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008040 if (PyIndex_Check(item)) {
8041 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008042 if (i == -1 && PyErr_Occurred())
8043 return NULL;
8044 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008045 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008046 return unicode_getitem(self, i);
8047 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008048 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008049 Py_UNICODE* source_buf;
8050 Py_UNICODE* result_buf;
8051 PyObject* result;
8052
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008053 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008054 &start, &stop, &step, &slicelength) < 0) {
8055 return NULL;
8056 }
8057
8058 if (slicelength <= 0) {
8059 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008060 } else if (start == 0 && step == 1 && slicelength == self->length &&
8061 PyUnicode_CheckExact(self)) {
8062 Py_INCREF(self);
8063 return (PyObject *)self;
8064 } else if (step == 1) {
8065 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008066 } else {
8067 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008068 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8069 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008070
8071 if (result_buf == NULL)
8072 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008073
8074 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8075 result_buf[i] = source_buf[cur];
8076 }
Tim Petersced69f82003-09-16 20:30:58 +00008077
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008078 result = PyUnicode_FromUnicode(result_buf, slicelength);
8079 PyMem_FREE(result_buf);
8080 return result;
8081 }
8082 } else {
8083 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8084 return NULL;
8085 }
8086}
8087
8088static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008089 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008090 (binaryfunc)unicode_subscript, /* mp_subscript */
8091 (objobjargproc)0, /* mp_ass_subscript */
8092};
8093
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094
8095static int
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008096unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008099 if (flags & PyBUF_CHARACTER) {
Guido van Rossuma74184e2007-08-29 04:05:57 +00008100 PyErr_SetString(PyExc_SystemError, "can't use str as char buffer");
8101 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 }
Guido van Rossuma74184e2007-08-29 04:05:57 +00008103 return PyBuffer_FillInfo(view, (void *)self->str,
8104 PyUnicode_GET_DATA_SIZE(self), 1, flags);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105}
8106
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008107
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108/* Helpers for PyUnicode_Format() */
8109
8110static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008111getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008113 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 if (argidx < arglen) {
8115 (*p_argidx)++;
8116 if (arglen < 0)
8117 return args;
8118 else
8119 return PyTuple_GetItem(args, argidx);
8120 }
8121 PyErr_SetString(PyExc_TypeError,
8122 "not enough arguments for format string");
8123 return NULL;
8124}
8125
/* Bit flags kept in the `flags` word of a format specifier.  F_ALT is what
   formatfloat() and formatint() test to emit the '#' alternate form. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
8131
Martin v. Löwis18e16552006-02-15 17:27:45 +00008132static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008133strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008135 register Py_ssize_t i;
8136 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 for (i = len - 1; i >= 0; i--)
8138 buffer[i] = (Py_UNICODE) charbuffer[i];
8139
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 return len;
8141}
8142
Neal Norwitzfc76d632006-01-10 06:03:13 +00008143static int
8144doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8145{
Tim Peters15231542006-02-16 01:08:01 +00008146 Py_ssize_t result;
8147
Neal Norwitzfc76d632006-01-10 06:03:13 +00008148 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008149 result = strtounicode(buffer, (char *)buffer);
8150 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008151}
8152
8153static int
8154longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8155{
Tim Peters15231542006-02-16 01:08:01 +00008156 Py_ssize_t result;
8157
Neal Norwitzfc76d632006-01-10 06:03:13 +00008158 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008159 result = strtounicode(buffer, (char *)buffer);
8160 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008161}
8162
Guido van Rossum078151d2002-08-11 04:24:12 +00008163/* XXX To save some code duplication, formatfloat/long/int could have been
8164 shared with stringobject.c, converting from 8-bit to Unicode after the
8165 formatting is done. */
8166
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167static int
8168formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008169 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 int flags,
8171 int prec,
8172 int type,
8173 PyObject *v)
8174{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008175 /* fmt = '%#.' + `prec` + `type`
8176 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177 char fmt[20];
8178 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008179
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 x = PyFloat_AsDouble(v);
8181 if (x == -1.0 && PyErr_Occurred())
8182 return -1;
8183 if (prec < 0)
8184 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8186 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008187 /* Worst case length calc to ensure no buffer overrun:
8188
8189 'g' formats:
8190 fmt = %#.<prec>g
8191 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8192 for any double rep.)
8193 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8194
8195 'f' formats:
8196 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8197 len = 1 + 50 + 1 + prec = 52 + prec
8198
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008199 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008200 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008201
8202 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008203 if (((type == 'g' || type == 'G') &&
8204 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008205 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008206 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008207 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008208 return -1;
8209 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008210 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8211 (flags&F_ALT) ? "#" : "",
8212 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008213 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214}
8215
Tim Peters38fd5b62000-09-21 05:43:11 +00008216static PyObject*
8217formatlong(PyObject *val, int flags, int prec, int type)
8218{
8219 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008220 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008221 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008222 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008223
8224 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8225 if (!str)
8226 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008227 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008228 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008229 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008230}
8231
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232static int
8233formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008234 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235 int flags,
8236 int prec,
8237 int type,
8238 PyObject *v)
8239{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008240 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008241 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8242 * + 1 + 1
8243 * = 24
8244 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008245 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008246 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 long x;
8248
8249 x = PyInt_AsLong(v);
8250 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008251 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008252 if (x < 0 && type == 'u') {
8253 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008254 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008255 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8256 sign = "-";
8257 else
8258 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008260 prec = 1;
8261
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008262 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8263 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008264 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008265 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008266 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008267 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008268 return -1;
8269 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008270
8271 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008272 (type == 'x' || type == 'X' || type == 'o')) {
8273 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008274 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008275 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008276 * - when 0 is being converted, the C standard leaves off
8277 * the '0x' or '0X', which is inconsistent with other
8278 * %#x/%#X conversions and inconsistent with Python's
8279 * hex() function
8280 * - there are platforms that violate the standard and
8281 * convert 0 with the '0x' or '0X'
8282 * (Metrowerks, Compaq Tru64)
8283 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008284 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008285 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008286 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008287 * We can achieve the desired consistency by inserting our
8288 * own '0x' or '0X' prefix, and substituting %x/%X in place
8289 * of %#x/%#X.
8290 *
8291 * Note that this is the same approach as used in
8292 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008293 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008294 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8295 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008296 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008297 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008298 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8299 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008300 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008301 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008302 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008303 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008304 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008305 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306}
8307
8308static int
8309formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008310 size_t buflen,
8311 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008313 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008314 if (PyUnicode_Check(v)) {
8315 if (PyUnicode_GET_SIZE(v) != 1)
8316 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008320 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008321 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008322 goto onError;
8323 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325
8326 else {
8327 /* Integer input truncated to a character */
8328 long x;
8329 x = PyInt_AsLong(v);
8330 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008331 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008332#ifdef Py_UNICODE_WIDE
8333 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008334 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008335 "%c arg not in range(0x110000) "
8336 "(wide Python build)");
8337 return -1;
8338 }
8339#else
8340 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008341 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008342 "%c arg not in range(0x10000) "
8343 "(narrow Python build)");
8344 return -1;
8345 }
8346#endif
8347 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 }
8349 buf[1] = '\0';
8350 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008351
8352 onError:
8353 PyErr_SetString(PyExc_TypeError,
8354 "%c requires int or char");
8355 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356}
8357
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size, in characters, of the scratch buffer in which
   floats, ints and chars are formatted.  XXX It is a magic number: each
   formatting routine does its own bounds checking so the buffer cannot
   overflow, but allocating a buffer of the appropriate size for each
   format might be a better solution.  The current approach is sufficient
   for now.
*/
#define FORMATBUFLEN ((size_t)120)
8367
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368PyObject *PyUnicode_Format(PyObject *format,
8369 PyObject *args)
8370{
8371 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008372 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 int args_owned = 0;
8374 PyUnicodeObject *result = NULL;
8375 PyObject *dict = NULL;
8376 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008377
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 if (format == NULL || args == NULL) {
8379 PyErr_BadInternalCall();
8380 return NULL;
8381 }
8382 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008383 if (uformat == NULL)
8384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 fmt = PyUnicode_AS_UNICODE(uformat);
8386 fmtcnt = PyUnicode_GET_SIZE(uformat);
8387
8388 reslen = rescnt = fmtcnt + 100;
8389 result = _PyUnicode_New(reslen);
8390 if (result == NULL)
8391 goto onError;
8392 res = PyUnicode_AS_UNICODE(result);
8393
8394 if (PyTuple_Check(args)) {
8395 arglen = PyTuple_Size(args);
8396 argidx = 0;
8397 }
8398 else {
8399 arglen = -1;
8400 argidx = -2;
8401 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008402 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008403 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 dict = args;
8405
8406 while (--fmtcnt >= 0) {
8407 if (*fmt != '%') {
8408 if (--rescnt < 0) {
8409 rescnt = fmtcnt + 100;
8410 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008411 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008412 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8414 --rescnt;
8415 }
8416 *res++ = *fmt++;
8417 }
8418 else {
8419 /* Got a format specifier */
8420 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008421 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 Py_UNICODE c = '\0';
8424 Py_UNICODE fill;
8425 PyObject *v = NULL;
8426 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008427 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008429 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008430 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431
8432 fmt++;
8433 if (*fmt == '(') {
8434 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008435 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 PyObject *key;
8437 int pcount = 1;
8438
8439 if (dict == NULL) {
8440 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008441 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 goto onError;
8443 }
8444 ++fmt;
8445 --fmtcnt;
8446 keystart = fmt;
8447 /* Skip over balanced parentheses */
8448 while (pcount > 0 && --fmtcnt >= 0) {
8449 if (*fmt == ')')
8450 --pcount;
8451 else if (*fmt == '(')
8452 ++pcount;
8453 fmt++;
8454 }
8455 keylen = fmt - keystart - 1;
8456 if (fmtcnt < 0 || pcount > 0) {
8457 PyErr_SetString(PyExc_ValueError,
8458 "incomplete format key");
8459 goto onError;
8460 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008461#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008462 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 then looked up since Python uses strings to hold
8464 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008465 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 key = PyUnicode_EncodeUTF8(keystart,
8467 keylen,
8468 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008469#else
8470 key = PyUnicode_FromUnicode(keystart, keylen);
8471#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472 if (key == NULL)
8473 goto onError;
8474 if (args_owned) {
8475 Py_DECREF(args);
8476 args_owned = 0;
8477 }
8478 args = PyObject_GetItem(dict, key);
8479 Py_DECREF(key);
8480 if (args == NULL) {
8481 goto onError;
8482 }
8483 args_owned = 1;
8484 arglen = -1;
8485 argidx = -2;
8486 }
8487 while (--fmtcnt >= 0) {
8488 switch (c = *fmt++) {
8489 case '-': flags |= F_LJUST; continue;
8490 case '+': flags |= F_SIGN; continue;
8491 case ' ': flags |= F_BLANK; continue;
8492 case '#': flags |= F_ALT; continue;
8493 case '0': flags |= F_ZERO; continue;
8494 }
8495 break;
8496 }
8497 if (c == '*') {
8498 v = getnextarg(args, arglen, &argidx);
8499 if (v == NULL)
8500 goto onError;
8501 if (!PyInt_Check(v)) {
8502 PyErr_SetString(PyExc_TypeError,
8503 "* wants int");
8504 goto onError;
8505 }
8506 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008507 if (width == -1 && PyErr_Occurred())
8508 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509 if (width < 0) {
8510 flags |= F_LJUST;
8511 width = -width;
8512 }
8513 if (--fmtcnt >= 0)
8514 c = *fmt++;
8515 }
8516 else if (c >= '0' && c <= '9') {
8517 width = c - '0';
8518 while (--fmtcnt >= 0) {
8519 c = *fmt++;
8520 if (c < '0' || c > '9')
8521 break;
8522 if ((width*10) / 10 != width) {
8523 PyErr_SetString(PyExc_ValueError,
8524 "width too big");
8525 goto onError;
8526 }
8527 width = width*10 + (c - '0');
8528 }
8529 }
8530 if (c == '.') {
8531 prec = 0;
8532 if (--fmtcnt >= 0)
8533 c = *fmt++;
8534 if (c == '*') {
8535 v = getnextarg(args, arglen, &argidx);
8536 if (v == NULL)
8537 goto onError;
8538 if (!PyInt_Check(v)) {
8539 PyErr_SetString(PyExc_TypeError,
8540 "* wants int");
8541 goto onError;
8542 }
8543 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008544 if (prec == -1 && PyErr_Occurred())
8545 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 if (prec < 0)
8547 prec = 0;
8548 if (--fmtcnt >= 0)
8549 c = *fmt++;
8550 }
8551 else if (c >= '0' && c <= '9') {
8552 prec = c - '0';
8553 while (--fmtcnt >= 0) {
8554 c = Py_CHARMASK(*fmt++);
8555 if (c < '0' || c > '9')
8556 break;
8557 if ((prec*10) / 10 != prec) {
8558 PyErr_SetString(PyExc_ValueError,
8559 "prec too big");
8560 goto onError;
8561 }
8562 prec = prec*10 + (c - '0');
8563 }
8564 }
8565 } /* prec */
8566 if (fmtcnt >= 0) {
8567 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 if (--fmtcnt >= 0)
8569 c = *fmt++;
8570 }
8571 }
8572 if (fmtcnt < 0) {
8573 PyErr_SetString(PyExc_ValueError,
8574 "incomplete format");
8575 goto onError;
8576 }
8577 if (c != '%') {
8578 v = getnextarg(args, arglen, &argidx);
8579 if (v == NULL)
8580 goto onError;
8581 }
8582 sign = 0;
8583 fill = ' ';
8584 switch (c) {
8585
8586 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008587 pbuf = formatbuf;
8588 /* presume that buffer length is at least 1 */
8589 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 len = 1;
8591 break;
8592
8593 case 's':
8594 case 'r':
8595 if (PyUnicode_Check(v) && c == 's') {
8596 temp = v;
8597 Py_INCREF(temp);
8598 }
8599 else {
8600 PyObject *unicode;
8601 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008602 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 else
8604 temp = PyObject_Repr(v);
8605 if (temp == NULL)
8606 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008607 if (PyUnicode_Check(temp))
8608 /* nothing to do */;
8609 else if (PyString_Check(temp)) {
8610 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008611 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008613 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008615 Py_DECREF(temp);
8616 temp = unicode;
8617 if (temp == NULL)
8618 goto onError;
8619 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008620 else {
8621 Py_DECREF(temp);
8622 PyErr_SetString(PyExc_TypeError,
8623 "%s argument has non-string str()");
8624 goto onError;
8625 }
8626 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008627 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 len = PyUnicode_GET_SIZE(temp);
8629 if (prec >= 0 && len > prec)
8630 len = prec;
8631 break;
8632
8633 case 'i':
8634 case 'd':
8635 case 'u':
8636 case 'o':
8637 case 'x':
8638 case 'X':
8639 if (c == 'i')
8640 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008641 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008642 temp = formatlong(v, flags, prec, c);
8643 if (!temp)
8644 goto onError;
8645 pbuf = PyUnicode_AS_UNICODE(temp);
8646 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008647 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008649 else {
8650 pbuf = formatbuf;
8651 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8652 flags, prec, c, v);
8653 if (len < 0)
8654 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008655 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008656 }
8657 if (flags & F_ZERO)
8658 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 break;
8660
8661 case 'e':
8662 case 'E':
8663 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008664 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 case 'g':
8666 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008667 if (c == 'F')
8668 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008669 pbuf = formatbuf;
8670 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8671 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 if (len < 0)
8673 goto onError;
8674 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008675 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 fill = '0';
8677 break;
8678
8679 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008680 pbuf = formatbuf;
8681 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 if (len < 0)
8683 goto onError;
8684 break;
8685
8686 default:
8687 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008688 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008689 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008690 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008691 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008692 (Py_ssize_t)(fmt - 1 -
8693 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 goto onError;
8695 }
8696 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008697 if (*pbuf == '-' || *pbuf == '+') {
8698 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 len--;
8700 }
8701 else if (flags & F_SIGN)
8702 sign = '+';
8703 else if (flags & F_BLANK)
8704 sign = ' ';
8705 else
8706 sign = 0;
8707 }
8708 if (width < len)
8709 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008710 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 reslen -= rescnt;
8712 rescnt = width + fmtcnt + 100;
8713 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008714 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008715 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008716 PyErr_NoMemory();
8717 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008718 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008719 if (_PyUnicode_Resize(&result, reslen) < 0) {
8720 Py_XDECREF(temp);
8721 goto onError;
8722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 res = PyUnicode_AS_UNICODE(result)
8724 + reslen - rescnt;
8725 }
8726 if (sign) {
8727 if (fill != ' ')
8728 *res++ = sign;
8729 rescnt--;
8730 if (width > len)
8731 width--;
8732 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008733 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008734 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008735 assert(pbuf[1] == c);
8736 if (fill != ' ') {
8737 *res++ = *pbuf++;
8738 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008739 }
Tim Petersfff53252001-04-12 18:38:48 +00008740 rescnt -= 2;
8741 width -= 2;
8742 if (width < 0)
8743 width = 0;
8744 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 if (width > len && !(flags & F_LJUST)) {
8747 do {
8748 --rescnt;
8749 *res++ = fill;
8750 } while (--width > len);
8751 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008752 if (fill == ' ') {
8753 if (sign)
8754 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008755 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008756 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008757 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008758 *res++ = *pbuf++;
8759 *res++ = *pbuf++;
8760 }
8761 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008762 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 res += len;
8764 rescnt -= len;
8765 while (--width >= len) {
8766 --rescnt;
8767 *res++ = ' ';
8768 }
8769 if (dict && (argidx < arglen) && c != '%') {
8770 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008771 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008772 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 goto onError;
8774 }
8775 Py_XDECREF(temp);
8776 } /* '%' */
8777 } /* until end */
8778 if (argidx < arglen && !dict) {
8779 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008780 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 goto onError;
8782 }
8783
Thomas Woutersa96affe2006-03-12 00:29:36 +00008784 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 if (args_owned) {
8787 Py_DECREF(args);
8788 }
8789 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 return (PyObject *)result;
8791
8792 onError:
8793 Py_XDECREF(result);
8794 Py_DECREF(uformat);
8795 if (args_owned) {
8796 Py_DECREF(args);
8797 }
8798 return NULL;
8799}
8800
/* Buffer-procs table for unicode objects (installed in the tp_as_buffer
   slot of PyUnicode_Type).  Only the first entry is filled in, with
   unicode_buffer_getbuffer; the other listed entry is left NULL. */
static PyBufferProcs unicode_as_buffer = {
    (getbufferproc) unicode_buffer_getbuffer,
    NULL,
};
8805
Jeremy Hylton938ace62002-07-17 16:30:39 +00008806static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008807unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8808
Tim Peters6d6c1a32001-08-02 04:15:00 +00008809static PyObject *
8810unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8811{
8812 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008813 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008814 char *encoding = NULL;
8815 char *errors = NULL;
8816
Guido van Rossume023fe02001-08-30 03:12:59 +00008817 if (type != &PyUnicode_Type)
8818 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008819 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8820 kwlist, &x, &encoding, &errors))
8821 return NULL;
8822 if (x == NULL)
8823 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008824 if (encoding == NULL && errors == NULL)
8825 return PyObject_Unicode(x);
8826 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008827 return PyUnicode_FromEncodedObject(x, encoding, errors);
8828}
8829
Guido van Rossume023fe02001-08-30 03:12:59 +00008830static PyObject *
8831unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8832{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008833 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008834 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008835
8836 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8837 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8838 if (tmp == NULL)
8839 return NULL;
8840 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008841 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008842 if (pnew == NULL) {
8843 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008844 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008845 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008846 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8847 if (pnew->str == NULL) {
8848 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008849 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008850 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008851 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008852 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008853 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8854 pnew->length = n;
8855 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008856 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008857 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008858}
8859
/* Docstring of the unicode type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(string [, encoding[, errors]]) -> object\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008866
/* Defined further down, next to the iterator type. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for unicode strings (tp_name is "str").  Slots are given
   positionally; each line is labelled with the slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                  /* tp_name */
    sizeof(PyUnicodeObject),                /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,            /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_compare */
    unicode_repr,                           /* tp_repr */
    &unicode_as_number,                     /* tp_as_number */
    &unicode_as_sequence,                   /* tp_as_sequence */
    &unicode_as_mapping,                    /* tp_as_mapping */
    (hashfunc) unicode_hash,                /* tp_hash*/
    0,                                      /* tp_call*/
    (reprfunc) unicode_str,                 /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    &unicode_as_buffer,                     /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                            /* tp_doc */
    0,                                      /* tp_traverse */
    0,                                      /* tp_clear */
    PyUnicode_RichCompare,                  /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    unicode_iter,                           /* tp_iter */
    0,                                      /* tp_iternext */
    unicode_methods,                        /* tp_methods */
    0,                                      /* tp_members */
    0,                                      /* tp_getset */
    &PyBaseString_Type,                     /* tp_base */
    0,                                      /* tp_dict */
    0,                                      /* tp_descr_get */
    0,                                      /* tp_descr_set */
    0,                                      /* tp_dictoffset */
    0,                                      /* tp_init */
    0,                                      /* tp_alloc */
    unicode_new,                            /* tp_new */
    PyObject_Del,                           /* tp_free */
};
8912
8913/* Initialize the Unicode implementation */
8914
Thomas Wouters78890102000-07-22 19:25:51 +00008915void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008917 int i;
8918
Thomas Wouters477c8d52006-05-27 19:21:47 +00008919 /* XXX - move this array to unicodectype.c ? */
8920 Py_UNICODE linebreak[] = {
8921 0x000A, /* LINE FEED */
8922 0x000D, /* CARRIAGE RETURN */
8923 0x001C, /* FILE SEPARATOR */
8924 0x001D, /* GROUP SEPARATOR */
8925 0x001E, /* RECORD SEPARATOR */
8926 0x0085, /* NEXT LINE */
8927 0x2028, /* LINE SEPARATOR */
8928 0x2029, /* PARAGRAPH SEPARATOR */
8929 };
8930
Fred Drakee4315f52000-05-09 19:53:39 +00008931 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008932 unicode_freelist = NULL;
8933 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008935 if (!unicode_empty)
8936 return;
8937
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008938 for (i = 0; i < 256; i++)
8939 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008940 if (PyType_Ready(&PyUnicode_Type) < 0)
8941 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008942
8943 /* initialize the linebreak bloom filter */
8944 bloom_linebreak = make_bloom_mask(
8945 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8946 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008947
8948 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949}
8950
8951/* Finalize the Unicode implementation */
8952
8953void
Thomas Wouters78890102000-07-22 19:25:51 +00008954_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008956 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008957 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008959 Py_XDECREF(unicode_empty);
8960 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008961
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008962 for (i = 0; i < 256; i++) {
8963 if (unicode_latin1[i]) {
8964 Py_DECREF(unicode_latin1[i]);
8965 unicode_latin1[i] = NULL;
8966 }
8967 }
8968
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008969 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 PyUnicodeObject *v = u;
8971 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008972 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008973 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008974 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008975 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008977 unicode_freelist = NULL;
8978 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008980
Walter Dörwald16807132007-05-25 13:52:07 +00008981void
8982PyUnicode_InternInPlace(PyObject **p)
8983{
8984 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8985 PyObject *t;
8986 if (s == NULL || !PyUnicode_Check(s))
8987 Py_FatalError(
8988 "PyUnicode_InternInPlace: unicode strings only please!");
8989 /* If it's a subclass, we don't really know what putting
8990 it in the interned dict might do. */
8991 if (!PyUnicode_CheckExact(s))
8992 return;
8993 if (PyUnicode_CHECK_INTERNED(s))
8994 return;
8995 if (interned == NULL) {
8996 interned = PyDict_New();
8997 if (interned == NULL) {
8998 PyErr_Clear(); /* Don't leave an exception */
8999 return;
9000 }
9001 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009002 /* It might be that the GetItem call fails even
9003 though the key is present in the dictionary,
9004 namely when this happens during a stack overflow. */
9005 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009006 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009007 Py_END_ALLOW_RECURSION
9008
Walter Dörwald16807132007-05-25 13:52:07 +00009009 if (t) {
9010 Py_INCREF(t);
9011 Py_DECREF(*p);
9012 *p = t;
9013 return;
9014 }
9015
Martin v. Löwis5b222132007-06-10 09:51:05 +00009016 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009017 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9018 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009019 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009020 return;
9021 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009022 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009023 /* The two references in interned are not counted by refcnt.
9024 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009025 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009026 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9027}
9028
9029void
9030PyUnicode_InternImmortal(PyObject **p)
9031{
9032 PyUnicode_InternInPlace(p);
9033 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9034 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9035 Py_INCREF(*p);
9036 }
9037}
9038
9039PyObject *
9040PyUnicode_InternFromString(const char *cp)
9041{
9042 PyObject *s = PyUnicode_FromString(cp);
9043 if (s == NULL)
9044 return NULL;
9045 PyUnicode_InternInPlace(&s);
9046 return s;
9047}
9048
9049void _Py_ReleaseInternedUnicodeStrings(void)
9050{
9051 PyObject *keys;
9052 PyUnicodeObject *s;
9053 Py_ssize_t i, n;
9054 Py_ssize_t immortal_size = 0, mortal_size = 0;
9055
9056 if (interned == NULL || !PyDict_Check(interned))
9057 return;
9058 keys = PyDict_Keys(interned);
9059 if (keys == NULL || !PyList_Check(keys)) {
9060 PyErr_Clear();
9061 return;
9062 }
9063
9064 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9065 detector, interned unicode strings are not forcibly deallocated;
9066 rather, we give them their stolen references back, and then clear
9067 and DECREF the interned dict. */
9068
9069 n = PyList_GET_SIZE(keys);
9070 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9071 n);
9072 for (i = 0; i < n; i++) {
9073 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9074 switch (s->state) {
9075 case SSTATE_NOT_INTERNED:
9076 /* XXX Shouldn't happen */
9077 break;
9078 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009079 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009080 immortal_size += s->length;
9081 break;
9082 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009083 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009084 mortal_size += s->length;
9085 break;
9086 default:
9087 Py_FatalError("Inconsistent interned string state.");
9088 }
9089 s->state = SSTATE_NOT_INTERNED;
9090 }
9091 fprintf(stderr, "total size of all interned strings: "
9092 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9093 "mortal/immortal\n", mortal_size, immortal_size);
9094 Py_DECREF(keys);
9095 PyDict_Clear(interned);
9096 Py_DECREF(interned);
9097 interned = NULL;
9098}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009099
9100
9101/********************* Unicode Iterator **************************/
9102
/* State of an iterator over a unicode object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* offset of the next character to return;
                                   unicodeiter_next() advances it by one
                                   per item produced */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9108
9109static void
9110unicodeiter_dealloc(unicodeiterobject *it)
9111{
9112 _PyObject_GC_UNTRACK(it);
9113 Py_XDECREF(it->it_seq);
9114 PyObject_GC_Del(it);
9115}
9116
9117static int
9118unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9119{
9120 Py_VISIT(it->it_seq);
9121 return 0;
9122}
9123
9124static PyObject *
9125unicodeiter_next(unicodeiterobject *it)
9126{
9127 PyUnicodeObject *seq;
9128 PyObject *item;
9129
9130 assert(it != NULL);
9131 seq = it->it_seq;
9132 if (seq == NULL)
9133 return NULL;
9134 assert(PyUnicode_Check(seq));
9135
9136 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009137 item = PyUnicode_FromUnicode(
9138 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009139 if (item != NULL)
9140 ++it->it_index;
9141 return item;
9142 }
9143
9144 Py_DECREF(seq);
9145 it->it_seq = NULL;
9146 return NULL;
9147}
9148
9149static PyObject *
9150unicodeiter_len(unicodeiterobject *it)
9151{
9152 Py_ssize_t len = 0;
9153 if (it->it_seq)
9154 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9155 return PyInt_FromSsize_t(len);
9156}
9157
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the unicode iterator type: only __length_hint__,
   implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9165
/* Type object of the iterator returned by unicode_iter().  Instances are
   GC-tracked (Py_TPFLAGS_HAVE_GC) and iterate over themselves
   (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicodeiterator",                      /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_compare */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
9198
9199static PyObject *
9200unicode_iter(PyObject *seq)
9201{
9202 unicodeiterobject *it;
9203
9204 if (!PyUnicode_Check(seq)) {
9205 PyErr_BadInternalCall();
9206 return NULL;
9207 }
9208 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9209 if (it == NULL)
9210 return NULL;
9211 it->it_index = 0;
9212 Py_INCREF(seq);
9213 it->it_seq = (PyUnicodeObject *)seq;
9214 _PyObject_GC_TRACK(it);
9215 return (PyObject *)it;
9216}
9217
Martin v. Löwis5b222132007-06-10 09:51:05 +00009218size_t
9219Py_UNICODE_strlen(const Py_UNICODE *u)
9220{
9221 int res = 0;
9222 while(*u++)
9223 res++;
9224 return res;
9225}
9226
9227Py_UNICODE*
9228Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9229{
9230 Py_UNICODE *u = s1;
9231 while ((*u++ = *s2++));
9232 return s1;
9233}
9234
9235Py_UNICODE*
9236Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9237{
9238 Py_UNICODE *u = s1;
9239 while ((*u++ = *s2++))
9240 if (n-- == 0)
9241 break;
9242 return s1;
9243}
9244
9245int
9246Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9247{
9248 while (*s1 && *s2 && *s1 == *s2)
9249 s1++, s2++;
9250 if (*s1 && *s2)
9251 return (*s1 < *s2) ? -1 : +1;
9252 if (*s1)
9253 return 1;
9254 if (*s2)
9255 return -1;
9256 return 0;
9257}
9258
9259Py_UNICODE*
9260Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9261{
9262 const Py_UNICODE *p;
9263 for (p = s; *p; p++)
9264 if (*p == c)
9265 return (Py_UNICODE*)p;
9266 return NULL;
9267}
9268
9269
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009270#ifdef __cplusplus
9271}
9272#endif
9273
9274
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009275/*
9276Local variables:
9277c-basic-offset: 4
9278indent-tabs-mode: nil
9279End:
9280*/