blob: f9d3068edf8820d4b1cba670aa08c943a313270d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Eric Smith8c663262007-08-25 02:26:07 +000048#include "formatter_unicode.h"
49
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000050#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000051#include <windows.h>
52#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000053
Guido van Rossumd57fd912000-03-10 22:53:23 +000054/* Limit for the Unicode object free list */
55
56#define MAX_UNICODE_FREELIST_SIZE 1024
57
58/* Limit for the Unicode object free list stay alive optimization.
59
60 The implementation will keep allocated Unicode memory intact for
61 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000062 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Barry Warsaw51ac5802000-03-20 16:36:48 +000064 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000066 malloc()-overhead) bytes of unused garbage.
67
68 Setting the limit to 0 effectively turns the feature off.
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070 Note: This is an experimental feature ! If you get core dumps when
71 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73*/
74
Guido van Rossumfd4b9572000-04-10 13:51:10 +000075#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000076
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085/* --- Globals ------------------------------------------------------------
86
87 The globals are initialized by the _PyUnicode_Init() API and should
88 not be used before calling that API.
89
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Walter Dörwald16807132007-05-25 13:52:07 +000097/* This dictionary holds all interned unicode strings. Note that references
98 to strings in this dictionary are *not* counted in the string's ob_refcnt.
99 When the interned string reaches a refcnt of 0 the string deallocation
100 function will delete the reference from this dictionary.
101
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115 shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
Fred Drakee4315f52000-05-09 19:53:39 +0000118/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000119 parameter; it is fixed to "utf-8". Always use the
120 PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000122
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000124PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000126#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000127 return 0x10FFFF;
128#else
129 /* This is actually an illegal character, so it should
130 not be passed to unichr. */
131 return 0xFFFF;
132#endif
133}
134
Thomas Wouters477c8d52006-05-27 19:21:47 +0000135/* --- Bloom Filters ----------------------------------------------------- */
136
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
140
141/* the linebreak mask is set up by Unicode_Init below */
142
143#define BLOOM_MASK unsigned long
144
145static BLOOM_MASK bloom_linebreak;
146
/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long because shifting a plain int 1
   left by 31 overflows a signed int; mask is parenthesized so that
   compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
148
149#define BLOOM_LINEBREAK(ch)\
150 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154 /* calculate simple bloom-style bitmask for a given unicode string */
155
156 long mask;
157 Py_ssize_t i;
158
159 mask = 0;
160 for (i = 0; i < len; i++)
161 mask |= (1 << (ptr[i] & 0x1F));
162
163 return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168 Py_ssize_t i;
169
170 for (i = 0; i < setlen; i++)
171 if (set[i] == chr)
172 return 1;
173
174 return 0;
175}
176
/* Non-zero if chr is a member of set: the cheap BLOOM() bit test is tried
   first and the linear unicode_member() scan only runs when that test
   passes.  The expansion is parenthesized as a whole so the macro can be
   negated or combined with other operators as a single expression. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
179
Guido van Rossumd57fd912000-03-10 22:53:23 +0000180/* --- Unicode Object ----------------------------------------------------- */
181
182static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000184 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185{
186 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000187
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000191
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192 /* Resizing shared object (unicode_empty or single character
193 objects) in-place is not allowed. Use PyUnicode_Resize()
194 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000195
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
Thomas Wouters477c8d52006-05-27 19:21:47 +0000205 /* We allocate one more byte to make sure the string is Ux0000 terminated.
206 The overallocation is also used by fastsearch, which assumes that it's
207 safe to look at str[length] (without making any assumptions about what
208 it contains). */
209
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 oldstr = unicode->str;
211 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 PyErr_NoMemory();
215 return -1;
216 }
217 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000222 if (unicode->defenc) {
223 Py_DECREF(unicode->defenc);
224 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000227
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 return 0;
229}
230
/* We allocate one more character (Py_UNICODE) to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234
235 XXX This allocator could further be enhanced by assuring that the
236 free list never reduces its size below 1.
237
238*/
239
240static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242{
243 register PyUnicodeObject *unicode;
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 if (length == 0 && unicode_empty != NULL) {
247 Py_INCREF(unicode_empty);
248 return unicode_empty;
249 }
250
251 /* Unicode freelist & memory allocation */
252 if (unicode_freelist) {
253 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000254 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 /* Keep-Alive optimization: we only upsize the buffer,
258 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000259 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000260 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000267 }
268 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 }
270 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000271 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 if (unicode == NULL)
273 return NULL;
274 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000292 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000295
296 onError:
297 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000298 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300}
301
302static
Guido van Rossum9475a232001-10-05 20:51:39 +0000303void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
Walter Dörwald16807132007-05-25 13:52:07 +0000305 switch (PyUnicode_CHECK_INTERNED(unicode)) {
306 case SSTATE_NOT_INTERNED:
307 break;
308
309 case SSTATE_INTERNED_MORTAL:
310 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000311 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000312 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313 Py_FatalError(
314 "deletion of interned unicode string failed");
315 break;
316
317 case SSTATE_INTERNED_IMMORTAL:
318 Py_FatalError("Immortal interned unicode string died.");
319
320 default:
321 Py_FatalError("Inconsistent interned unicode string state.");
322 }
323
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 if (PyUnicode_CheckExact(unicode) &&
325 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000326 /* Keep-Alive optimization */
327 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000328 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 unicode->str = NULL;
330 unicode->length = 0;
331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000332 if (unicode->defenc) {
333 Py_DECREF(unicode->defenc);
334 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000335 }
336 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 *(PyUnicodeObject **)unicode = unicode_freelist;
338 unicode_freelist = unicode;
339 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 }
341 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000342 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000343 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000344 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346}
347
Martin v. Löwis18e16552006-02-15 17:27:45 +0000348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349{
350 register PyUnicodeObject *v;
351
352 /* Argument checks */
353 if (unicode == NULL) {
354 PyErr_BadInternalCall();
355 return -1;
356 }
357 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000358 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 PyErr_BadInternalCall();
360 return -1;
361 }
362
363 /* Resizing unicode_empty and single character objects is not
364 possible since these are being shared. We simply return a fresh
365 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000366 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 (v == unicode_empty || v->length == 1)) {
368 PyUnicodeObject *w = _PyUnicode_New(length);
369 if (w == NULL)
370 return -1;
371 Py_UNICODE_COPY(w->str, v->str,
372 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000373 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000374 *unicode = (PyObject *)w;
375 return 0;
376 }
377
378 /* Note that we don't have to modify *unicode for unshared Unicode
379 objects, since we can modify them in-place. */
380 return unicode_resize(v, length);
381}
382
383/* Internal API for use in unicodeobject.c only ! */
384#define _PyUnicode_Resize(unicodevar, length) \
385 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
386
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389{
390 PyUnicodeObject *unicode;
391
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000392 /* If the Unicode data is known at construction time, we can apply
393 some optimizations which share commonly used objects. */
394 if (u != NULL) {
395
396 /* Optimization for empty strings */
397 if (size == 0 && unicode_empty != NULL) {
398 Py_INCREF(unicode_empty);
399 return (PyObject *)unicode_empty;
400 }
401
402 /* Single character Unicode objects in the Latin-1 range are
403 shared when using this constructor */
404 if (size == 1 && *u < 256) {
405 unicode = unicode_latin1[*u];
406 if (!unicode) {
407 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 if (!unicode)
409 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000410 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000411 unicode_latin1[*u] = unicode;
412 }
413 Py_INCREF(unicode);
414 return (PyObject *)unicode;
415 }
416 }
Tim Petersced69f82003-09-16 20:30:58 +0000417
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 unicode = _PyUnicode_New(size);
419 if (!unicode)
420 return NULL;
421
422 /* Copy the Unicode data into the new object */
423 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000424 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425
426 return (PyObject *)unicode;
427}
428
Walter Dörwaldd2034312007-05-18 16:29:38 +0000429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430{
431 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000432 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000433 some optimizations which share commonly used objects.
434 Also, this means the input must be UTF-8, so fall back to the
435 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000436 if (u != NULL) {
437
438 /* Optimization for empty strings */
439 if (size == 0 && unicode_empty != NULL) {
440 Py_INCREF(unicode_empty);
441 return (PyObject *)unicode_empty;
442 }
443
Martin v. Löwis9c121062007-08-05 20:26:11 +0000444 /* Single characters are shared when using this constructor.
445 Restrict to ASCII, since the input must be UTF-8. */
446 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000447 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 if (!unicode) {
449 unicode = _PyUnicode_New(1);
450 if (!unicode)
451 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode->str[0] = Py_CHARMASK(*u);
453 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000454 }
455 Py_INCREF(unicode);
456 return (PyObject *)unicode;
457 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000458
459 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 }
461
Walter Dörwald55507312007-05-18 13:12:10 +0000462 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 if (!unicode)
464 return NULL;
465
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000466 return (PyObject *)unicode;
467}
468
Walter Dörwaldd2034312007-05-18 16:29:38 +0000469PyObject *PyUnicode_FromString(const char *u)
470{
471 size_t size = strlen(u);
472 if (size > PY_SSIZE_T_MAX) {
473 PyErr_SetString(PyExc_OverflowError, "input too long");
474 return NULL;
475 }
476
477 return PyUnicode_FromStringAndSize(u, size);
478}
479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484{
485 PyUnicodeObject *unicode;
486
487 if (w == NULL) {
488 PyErr_BadInternalCall();
489 return NULL;
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000499#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 {
501 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000502 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000504 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 *u++ = *w++;
506 }
507#endif
508
509 return (PyObject *)unicode;
510}
511
Walter Dörwald346737f2007-05-31 10:44:43 +0000512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515 *fmt++ = '%';
516 if (width) {
517 if (zeropad)
518 *fmt++ = '0';
519 fmt += sprintf(fmt, "%d", width);
520 }
521 if (precision)
522 fmt += sprintf(fmt, ".%d", precision);
523 if (longflag)
524 *fmt++ = 'l';
525 else if (size_tflag) {
526 char *f = PY_FORMAT_SIZE_T;
527 while (*f)
528 *fmt++ = *f++;
529 }
530 *fmt++ = c;
531 *fmt = '\0';
532}
533
Walter Dörwaldd2034312007-05-18 16:29:38 +0000534#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 Py_ssize_t callcount = 0;
541 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000542 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000543 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000544 int width = 0;
545 int precision = 0;
546 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000547 const char* f;
548 Py_UNICODE *s;
549 PyObject *string;
550 /* used by sprintf */
551 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000552 /* use abuffer instead of buffer, if we need more space
553 * (which can happen if there's a format specifier with width). */
554 char *abuffer = NULL;
555 char *realbuffer;
556 Py_ssize_t abuffersize = 0;
557 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000558 const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561 Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef __va_copy
564 __va_copy(count, vargs);
565#else
566 count = vargs;
567#endif
568#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000569 /* step 1: count the number of %S/%R format specifications
570 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000574 ++callcount;
575 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 /* step 2: allocate memory for the results of
577 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000578 if (callcount) {
579 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580 if (!callresults) {
581 PyErr_NoMemory();
582 return NULL;
583 }
584 callresult = callresults;
585 }
586 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000587 for (f = format; *f; f++) {
588 if (*f == '%') {
589 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000590 width = 0;
591 while (isdigit(Py_CHARMASK(*f)))
592 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000593 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594 ;
595
596 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597 * they don't affect the amount of space we reserve.
598 */
599 if ((*f == 'l' || *f == 'z') &&
600 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000601 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000602
603 switch (*f) {
604 case 'c':
605 (void)va_arg(count, int);
606 /* fall through... */
607 case '%':
608 n++;
609 break;
610 case 'd': case 'u': case 'i': case 'x':
611 (void) va_arg(count, int);
612 /* 20 bytes is enough to hold a 64-bit
613 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000614 This isn't enough for octal.
615 If a width is specified we need more
616 (which we allocate later). */
617 if (width < 20)
618 width = 20;
619 n += width;
620 if (abuffersize < width)
621 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000622 break;
623 case 's':
624 n += strlen(va_arg(count, char*));
625 break;
626 case 'U':
627 {
628 PyObject *obj = va_arg(count, PyObject *);
629 assert(obj && PyUnicode_Check(obj));
630 n += PyUnicode_GET_SIZE(obj);
631 break;
632 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000633 case 'V':
634 {
635 PyObject *obj = va_arg(count, PyObject *);
636 const char *str = va_arg(count, const char *);
637 assert(obj || str);
638 assert(!obj || PyUnicode_Check(obj));
639 if (obj)
640 n += PyUnicode_GET_SIZE(obj);
641 else
642 n += strlen(str);
643 break;
644 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000645 case 'S':
646 {
647 PyObject *obj = va_arg(count, PyObject *);
648 PyObject *str;
649 assert(obj);
650 str = PyObject_Unicode(obj);
651 if (!str)
652 goto fail;
653 n += PyUnicode_GET_SIZE(str);
654 /* Remember the str and switch to the next slot */
655 *callresult++ = str;
656 break;
657 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000658 case 'R':
659 {
660 PyObject *obj = va_arg(count, PyObject *);
661 PyObject *repr;
662 assert(obj);
663 repr = PyObject_Repr(obj);
664 if (!repr)
665 goto fail;
666 n += PyUnicode_GET_SIZE(repr);
667 /* Remember the repr and switch to the next slot */
668 *callresult++ = repr;
669 break;
670 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000671 case 'p':
672 (void) va_arg(count, int);
673 /* maximum 64-bit pointer representation:
674 * 0xffffffffffffffff
675 * so 19 characters is enough.
676 * XXX I count 18 -- what's the extra for?
677 */
678 n += 19;
679 break;
680 default:
681 /* if we stumble upon an unknown
682 formatting code, copy the rest of
683 the format string to the output
684 string. (we cannot just skip the
685 code, since there's no way to know
686 what's in the argument list) */
687 n += strlen(p);
688 goto expand;
689 }
690 } else
691 n++;
692 }
693 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000694 if (abuffersize > 20) {
695 abuffer = PyMem_Malloc(abuffersize);
696 if (!abuffer) {
697 PyErr_NoMemory();
698 goto fail;
699 }
700 realbuffer = abuffer;
701 }
702 else
703 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000706 we don't have to resize the string.
707 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708 string = PyUnicode_FromUnicode(NULL, n);
709 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000710 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000713 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000714
715 for (f = format; *f; f++) {
716 if (*f == '%') {
717 const char* p = f++;
718 int longflag = 0;
719 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000720 zeropad = (*f == '0');
721 /* parse the width.precision part */
722 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000724 width = (width*10) + *f++ - '0';
725 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 if (*f == '.') {
727 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000729 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000730 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 /* handle the long flag, but only for %ld and %lu.
732 others can be added when necessary. */
733 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
734 longflag = 1;
735 ++f;
736 }
737 /* handle the size_t flag. */
738 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
739 size_tflag = 1;
740 ++f;
741 }
742
743 switch (*f) {
744 case 'c':
745 *s++ = va_arg(vargs, int);
746 break;
747 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000754 sprintf(realbuffer, fmt, va_arg(vargs, int));
755 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 break;
757 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000764 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
765 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 break;
767 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000768 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
769 sprintf(realbuffer, fmt, va_arg(vargs, int));
770 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000771 break;
772 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000773 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
774 sprintf(realbuffer, fmt, va_arg(vargs, int));
775 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776 break;
777 case 's':
778 p = va_arg(vargs, char*);
779 appendstring(p);
780 break;
781 case 'U':
782 {
783 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000784 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
785 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
786 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 break;
788 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000789 case 'V':
790 {
791 PyObject *obj = va_arg(vargs, PyObject *);
792 const char *str = va_arg(vargs, const char *);
793 if (obj) {
794 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
795 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
796 s += size;
797 } else {
798 appendstring(str);
799 }
800 break;
801 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000802 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000803 case 'R':
804 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000805 Py_UNICODE *ucopy;
806 Py_ssize_t usize;
807 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000808 /* unused, since we already have the result */
809 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000810 ucopy = PyUnicode_AS_UNICODE(*callresult);
811 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 for (upos = 0; upos<usize;)
813 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000816 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000817 ++callresult;
818 break;
819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820 case 'p':
821 sprintf(buffer, "%p", va_arg(vargs, void*));
822 /* %p is ill-defined: ensure leading 0x. */
823 if (buffer[1] == 'X')
824 buffer[1] = 'x';
825 else if (buffer[1] != 'x') {
826 memmove(buffer+2, buffer, strlen(buffer)+1);
827 buffer[0] = '0';
828 buffer[1] = 'x';
829 }
830 appendstring(buffer);
831 break;
832 case '%':
833 *s++ = '%';
834 break;
835 default:
836 appendstring(p);
837 goto end;
838 }
839 } else
840 *s++ = *f;
841 }
842
843 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000844 if (callresults)
845 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000846 if (abuffer)
847 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000848 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
849 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000850 fail:
851 if (callresults) {
852 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000853 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000854 Py_DECREF(*callresult2);
855 ++callresult2;
856 }
857 PyMem_Free(callresults);
858 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000859 if (abuffer)
860 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000862}
863
864#undef appendstring
865
866PyObject *
867PyUnicode_FromFormat(const char *format, ...)
868{
869 PyObject* ret;
870 va_list vargs;
871
872#ifdef HAVE_STDARG_PROTOTYPES
873 va_start(vargs, format);
874#else
875 va_start(vargs);
876#endif
877 ret = PyUnicode_FromFormatV(format, vargs);
878 va_end(vargs);
879 return ret;
880}
881
Martin v. Löwis18e16552006-02-15 17:27:45 +0000882Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
883 wchar_t *w,
884 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
886 if (unicode == NULL) {
887 PyErr_BadInternalCall();
888 return -1;
889 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890
891 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000893 size = PyUnicode_GET_SIZE(unicode) + 1;
894
Guido van Rossumd57fd912000-03-10 22:53:23 +0000895#ifdef HAVE_USABLE_WCHAR_T
896 memcpy(w, unicode->str, size * sizeof(wchar_t));
897#else
898 {
899 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000900 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000902 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 *w++ = *u++;
904 }
905#endif
906
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000907 if (size > PyUnicode_GET_SIZE(unicode))
908 return PyUnicode_GET_SIZE(unicode);
909 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910 return size;
911}
912
913#endif
914
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915PyObject *PyUnicode_FromOrdinal(int ordinal)
916{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000917 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000918
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 if (ordinal < 0 || ordinal > 0x10ffff) {
920 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000922 return NULL;
923 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000924
925#ifndef Py_UNICODE_WIDE
926 if (ordinal > 0xffff) {
927 ordinal -= 0x10000;
928 s[0] = 0xD800 | (ordinal >> 10);
929 s[1] = 0xDC00 | (ordinal & 0x3FF);
930 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000931 }
932#endif
933
Hye-Shik Chang40574832004-04-06 07:24:51 +0000934 s[0] = (Py_UNICODE)ordinal;
935 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000936}
937
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938PyObject *PyUnicode_FromObject(register PyObject *obj)
939{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000940 /* XXX Perhaps we should make this API an alias of
941 PyObject_Unicode() instead ?! */
942 if (PyUnicode_CheckExact(obj)) {
943 Py_INCREF(obj);
944 return obj;
945 }
946 if (PyUnicode_Check(obj)) {
947 /* For a Unicode subtype that's not a Unicode object,
948 return a true Unicode object with the same data. */
949 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
950 PyUnicode_GET_SIZE(obj));
951 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000952 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
953}
954
955PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
956 const char *encoding,
957 const char *errors)
958{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000959 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000960 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000961 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 if (obj == NULL) {
964 PyErr_BadInternalCall();
965 return NULL;
966 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000967
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000968 if (PyUnicode_Check(obj)) {
969 PyErr_SetString(PyExc_TypeError,
970 "decoding Unicode is not supported");
971 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000972 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000973
974 /* Coerce object */
975 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 s = PyString_AS_STRING(obj);
977 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000978 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
980 /* Overwrite the error message with something more useful in
981 case of a TypeError. */
982 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 "coercing to Unicode: need string or buffer, "
985 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000986 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000987 goto onError;
988 }
Tim Petersced69f82003-09-16 20:30:58 +0000989
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000990 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 if (len == 0) {
992 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000993 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 }
Tim Petersced69f82003-09-16 20:30:58 +0000995 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000996 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000997
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000998 return v;
999
1000 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002}
1003
1004PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001005 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001006 const char *encoding,
1007 const char *errors)
1008{
1009 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001010
1011 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001012 encoding = PyUnicode_GetDefaultEncoding();
1013
1014 /* Shortcuts for common default encodings */
1015 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001017 else if (strcmp(encoding, "latin-1") == 0)
1018 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001019#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1020 else if (strcmp(encoding, "mbcs") == 0)
1021 return PyUnicode_DecodeMBCS(s, size, errors);
1022#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001023 else if (strcmp(encoding, "ascii") == 0)
1024 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025
1026 /* Decode via the codec registry */
1027 buffer = PyBuffer_FromMemory((void *)s, size);
1028 if (buffer == NULL)
1029 goto onError;
1030 unicode = PyCodec_Decode(buffer, encoding, errors);
1031 if (unicode == NULL)
1032 goto onError;
1033 if (!PyUnicode_Check(unicode)) {
1034 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001035 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001036 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 Py_DECREF(unicode);
1038 goto onError;
1039 }
1040 Py_DECREF(buffer);
1041 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001042
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 onError:
1044 Py_XDECREF(buffer);
1045 return NULL;
1046}
1047
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001048PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1049 const char *encoding,
1050 const char *errors)
1051{
1052 PyObject *v;
1053
1054 if (!PyUnicode_Check(unicode)) {
1055 PyErr_BadArgument();
1056 goto onError;
1057 }
1058
1059 if (encoding == NULL)
1060 encoding = PyUnicode_GetDefaultEncoding();
1061
1062 /* Decode via the codec registry */
1063 v = PyCodec_Decode(unicode, encoding, errors);
1064 if (v == NULL)
1065 goto onError;
1066 return v;
1067
1068 onError:
1069 return NULL;
1070}
1071
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001073 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 const char *encoding,
1075 const char *errors)
1076{
1077 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 unicode = PyUnicode_FromUnicode(s, size);
1080 if (unicode == NULL)
1081 return NULL;
1082 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1083 Py_DECREF(unicode);
1084 return v;
1085}
1086
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001087PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1088 const char *encoding,
1089 const char *errors)
1090{
1091 PyObject *v;
1092
1093 if (!PyUnicode_Check(unicode)) {
1094 PyErr_BadArgument();
1095 goto onError;
1096 }
1097
1098 if (encoding == NULL)
1099 encoding = PyUnicode_GetDefaultEncoding();
1100
1101 /* Encode via the codec registry */
1102 v = PyCodec_Encode(unicode, encoding, errors);
1103 if (v == NULL)
1104 goto onError;
1105 return v;
1106
1107 onError:
1108 return NULL;
1109}
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1112 const char *encoding,
1113 const char *errors)
1114{
1115 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001116
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 if (!PyUnicode_Check(unicode)) {
1118 PyErr_BadArgument();
1119 goto onError;
1120 }
Fred Drakee4315f52000-05-09 19:53:39 +00001121
Tim Petersced69f82003-09-16 20:30:58 +00001122 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001123 encoding = PyUnicode_GetDefaultEncoding();
1124
1125 /* Shortcuts for common default encodings */
1126 if (errors == NULL) {
1127 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001128 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001129 else if (strcmp(encoding, "latin-1") == 0)
1130 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001131#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1132 else if (strcmp(encoding, "mbcs") == 0)
1133 return PyUnicode_AsMBCSString(unicode);
1134#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001135 else if (strcmp(encoding, "ascii") == 0)
1136 return PyUnicode_AsASCIIString(unicode);
1137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139 /* Encode via the codec registry */
1140 v = PyCodec_Encode(unicode, encoding, errors);
1141 if (v == NULL)
1142 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001143 if (!PyBytes_Check(v)) {
1144 if (PyString_Check(v)) {
1145 /* Old codec, turn it into bytes */
1146 PyObject *b = PyBytes_FromObject(v);
1147 Py_DECREF(v);
1148 return b;
1149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001151 "encoder did not return a bytes object "
1152 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1153 v->ob_type->tp_name,
1154 encoding ? encoding : "NULL",
1155 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 Py_DECREF(v);
1157 goto onError;
1158 }
1159 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 onError:
1162 return NULL;
1163}
1164
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001165PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1166 const char *errors)
1167{
1168 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001169 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001170 if (v)
1171 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001172 if (errors != NULL)
1173 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001174 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1175 PyUnicode_GET_SIZE(unicode),
1176 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001177 if (!b)
1178 return NULL;
1179 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1180 PyBytes_Size(b));
1181 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001182 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001183 return v;
1184}
1185
Martin v. Löwis5b222132007-06-10 09:51:05 +00001186char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001187PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001188{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001189 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001190 if (!PyUnicode_Check(unicode)) {
1191 PyErr_BadArgument();
1192 return NULL;
1193 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001194 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1195 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001196 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001197 if (psize != NULL)
1198 *psize = PyString_GET_SIZE(str8);
1199 return PyString_AS_STRING(str8);
1200}
1201
1202char*
1203PyUnicode_AsString(PyObject *unicode)
1204{
1205 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001206}
1207
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1209{
1210 if (!PyUnicode_Check(unicode)) {
1211 PyErr_BadArgument();
1212 goto onError;
1213 }
1214 return PyUnicode_AS_UNICODE(unicode);
1215
1216 onError:
1217 return NULL;
1218}
1219
Martin v. Löwis18e16552006-02-15 17:27:45 +00001220Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221{
1222 if (!PyUnicode_Check(unicode)) {
1223 PyErr_BadArgument();
1224 goto onError;
1225 }
1226 return PyUnicode_GET_SIZE(unicode);
1227
1228 onError:
1229 return -1;
1230}
1231
Thomas Wouters78890102000-07-22 19:25:51 +00001232const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001233{
1234 return unicode_default_encoding;
1235}
1236
1237int PyUnicode_SetDefaultEncoding(const char *encoding)
1238{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001239 if (strcmp(encoding, unicode_default_encoding) != 0) {
1240 PyErr_Format(PyExc_ValueError,
1241 "Can only set default encoding to %s",
1242 unicode_default_encoding);
1243 return -1;
1244 }
Fred Drakee4315f52000-05-09 19:53:39 +00001245 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001246}
1247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248/* error handling callback helper:
1249 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001250 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001251 and adjust various state variables.
1252 return 0 on success, -1 on error
1253*/
1254
1255static
1256int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1257 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001258 const char **input, const char **inend, Py_ssize_t *startinpos,
1259 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001260 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001262 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001263
1264 PyObject *restuple = NULL;
1265 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001266 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001267 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001268 Py_ssize_t requiredsize;
1269 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001270 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001271 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 int res = -1;
1274
1275 if (*errorHandler == NULL) {
1276 *errorHandler = PyCodec_LookupError(errors);
1277 if (*errorHandler == NULL)
1278 goto onError;
1279 }
1280
1281 if (*exceptionObject == NULL) {
1282 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001283 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 if (*exceptionObject == NULL)
1285 goto onError;
1286 }
1287 else {
1288 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1289 goto onError;
1290 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1291 goto onError;
1292 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1293 goto onError;
1294 }
1295
1296 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1297 if (restuple == NULL)
1298 goto onError;
1299 if (!PyTuple_Check(restuple)) {
1300 PyErr_Format(PyExc_TypeError, &argparse[4]);
1301 goto onError;
1302 }
1303 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1304 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001305
1306 /* Copy back the bytes variables, which might have been modified by the
1307 callback */
1308 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1309 if (!inputobj)
1310 goto onError;
1311 if (!PyBytes_Check(inputobj)) {
1312 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1313 }
1314 *input = PyBytes_AS_STRING(inputobj);
1315 insize = PyBytes_GET_SIZE(inputobj);
1316 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001317 /* we can DECREF safely, as the exception has another reference,
1318 so the object won't go away. */
1319 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001321 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001322 newpos = insize+newpos;
1323 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001324 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001325 goto onError;
1326 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327
1328 /* need more space? (at least enough for what we
1329 have+the replacement+the rest of the string (starting
1330 at the new input position), so we won't have to check space
1331 when there are no errors in the rest of the string) */
1332 repptr = PyUnicode_AS_UNICODE(repunicode);
1333 repsize = PyUnicode_GET_SIZE(repunicode);
1334 requiredsize = *outpos + repsize + insize-newpos;
1335 if (requiredsize > outsize) {
1336 if (requiredsize<2*outsize)
1337 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001338 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 goto onError;
1340 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1341 }
1342 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001343 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 Py_UNICODE_COPY(*outptr, repptr, repsize);
1345 *outptr += repsize;
1346 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001348 /* we made it! */
1349 res = 0;
1350
1351 onError:
1352 Py_XDECREF(restuple);
1353 return res;
1354}
1355
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001356/* --- UTF-7 Codec -------------------------------------------------------- */
1357
1358/* see RFC2152 for details */
1359
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry tells whether the character is special, i.e. cannot be
   directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   One row per 16 code points. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1378
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True when character c must not be written directly: it is outside ASCII,
   is classified 1 in utf7_special, or is whitespace / Set O while the
   matching encodeWS / encodeO flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is one of the 64 base64 digits.  Written with explicit ASCII
   range tests rather than isalnum(): c may be a Py_UNICODE value outside
   the unsigned char range (undefined behaviour for <ctype.h> functions),
   and isalnum() is locale dependent for values above 127. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` for as long as at least six bits are
   pending in `ch`; `bits` is decremented accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit units through `out` for as long as at least sixteen bits are
   pending in `ch`.  Expects `errmsg` and a `utf7Error` label to exist in
   the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001421
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001422PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001424 const char *errors)
1425{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001427 Py_ssize_t startinpos;
1428 Py_ssize_t endinpos;
1429 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430 const char *e;
1431 PyUnicodeObject *unicode;
1432 Py_UNICODE *p;
1433 const char *errmsg = "";
1434 int inShift = 0;
1435 unsigned int bitsleft = 0;
1436 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001437 int surrogate = 0;
1438 PyObject *errorHandler = NULL;
1439 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001440
1441 unicode = _PyUnicode_New(size);
1442 if (!unicode)
1443 return NULL;
1444 if (size == 0)
1445 return (PyObject *)unicode;
1446
1447 p = unicode->str;
1448 e = s + size;
1449
1450 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001451 Py_UNICODE ch;
1452 restart:
1453 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001454
1455 if (inShift) {
1456 if ((ch == '-') || !B64CHAR(ch)) {
1457 inShift = 0;
1458 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001459
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001460 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1461 if (bitsleft >= 6) {
1462 /* The shift sequence has a partial character in it. If
1463 bitsleft < 6 then we could just classify it as padding
1464 but that is not the case here */
1465
1466 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001467 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001468 }
1469 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001470 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001471 here so indicate the potential of a misencoded character. */
1472
1473 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1474 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1475 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001476 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477 }
1478
1479 if (ch == '-') {
1480 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001481 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482 inShift = 1;
1483 }
1484 } else if (SPECIAL(ch,0,0)) {
1485 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001486 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487 } else {
1488 *p++ = ch;
1489 }
1490 } else {
1491 charsleft = (charsleft << 6) | UB64(ch);
1492 bitsleft += 6;
1493 s++;
1494 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1495 }
1496 }
1497 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001499 s++;
1500 if (s < e && *s == '-') {
1501 s++;
1502 *p++ = '+';
1503 } else
1504 {
1505 inShift = 1;
1506 bitsleft = 0;
1507 }
1508 }
1509 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001510 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001511 errmsg = "unexpected special character";
1512 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001513 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001514 }
1515 else {
1516 *p++ = ch;
1517 s++;
1518 }
1519 continue;
1520 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001521 outpos = p-PyUnicode_AS_UNICODE(unicode);
1522 endinpos = s-starts;
1523 if (unicode_decode_call_errorhandler(
1524 errors, &errorHandler,
1525 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001526 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001527 (PyObject **)&unicode, &outpos, &p))
1528 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001529 }
1530
1531 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001532 outpos = p-PyUnicode_AS_UNICODE(unicode);
1533 endinpos = size;
1534 if (unicode_decode_call_errorhandler(
1535 errors, &errorHandler,
1536 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001537 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001539 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 if (s < e)
1541 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001542 }
1543
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001544 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001545 goto onError;
1546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_XDECREF(errorHandler);
1548 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 return (PyObject *)unicode;
1550
1551onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 Py_XDECREF(errorHandler);
1553 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554 Py_DECREF(unicode);
1555 return NULL;
1556}
1557
1558
1559PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561 int encodeSetO,
1562 int encodeWhiteSpace,
1563 const char *errors)
1564{
1565 PyObject *v;
1566 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001567 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001569 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570 unsigned int bitsleft = 0;
1571 unsigned long charsleft = 0;
1572 char * out;
1573 char * start;
1574
1575 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001576 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577
Walter Dörwald51ab4142007-05-05 14:43:36 +00001578 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 if (v == NULL)
1580 return NULL;
1581
Walter Dörwald51ab4142007-05-05 14:43:36 +00001582 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 for (;i < size; ++i) {
1584 Py_UNICODE ch = s[i];
1585
1586 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001587 if (ch == '+') {
1588 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 *out++ = '-';
1590 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1591 charsleft = ch;
1592 bitsleft = 16;
1593 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001594 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001596 } else {
1597 *out++ = (char) ch;
1598 }
1599 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001600 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1601 *out++ = B64(charsleft << (6-bitsleft));
1602 charsleft = 0;
1603 bitsleft = 0;
1604 /* Characters not in the BASE64 set implicitly unshift the sequence
1605 so no '-' is required, except if the character is itself a '-' */
1606 if (B64CHAR(ch) || ch == '-') {
1607 *out++ = '-';
1608 }
1609 inShift = 0;
1610 *out++ = (char) ch;
1611 } else {
1612 bitsleft += 16;
1613 charsleft = (charsleft << 16) | ch;
1614 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1615
1616 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001617 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001618 or '-' then the shift sequence will be terminated implicitly and we
1619 don't have to insert a '-'. */
1620
1621 if (bitsleft == 0) {
1622 if (i + 1 < size) {
1623 Py_UNICODE ch2 = s[i+1];
1624
1625 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001626
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001627 } else if (B64CHAR(ch2) || ch2 == '-') {
1628 *out++ = '-';
1629 inShift = 0;
1630 } else {
1631 inShift = 0;
1632 }
1633
1634 }
1635 else {
1636 *out++ = '-';
1637 inShift = 0;
1638 }
1639 }
Tim Petersced69f82003-09-16 20:30:58 +00001640 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001642 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001643 if (bitsleft) {
1644 *out++= B64(charsleft << (6-bitsleft) );
1645 *out++ = '-';
1646 }
1647
Walter Dörwald51ab4142007-05-05 14:43:36 +00001648 if (PyBytes_Resize(v, out - start)) {
1649 Py_DECREF(v);
1650 return NULL;
1651 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 return v;
1653}
1654
/* The UTF-7 helper macros are private to the codec above; drop them so
   the names do not leak into the rest of the file. */
#undef B64
#undef B64CHAR
#undef DECODE
#undef ENCODE
#undef SPECIAL
#undef UB64
1661
Guido van Rossumd57fd912000-03-10 22:53:23 +00001662/* --- UTF-8 Codec -------------------------------------------------------- */
1663
/* Maps the first byte of a UTF-8 sequence to the total length of that
   sequence in bytes.  A zero entry means the byte is not a legal prefix.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x70 */
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xB0 */
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xD0 */
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     /* 0xE0 */
    /* 0xF0 - 0xFD: four, five and six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0      /* 0xF0 */
};
1685
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001687 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 const char *errors)
1689{
Walter Dörwald69652032004-09-07 20:24:22 +00001690 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1691}
1692
1693PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001694 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001695 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001696 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001697{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001700 Py_ssize_t startinpos;
1701 Py_ssize_t endinpos;
1702 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 const char *e;
1704 PyUnicodeObject *unicode;
1705 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001706 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001707 PyObject *errorHandler = NULL;
1708 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709
1710 /* Note: size will always be longer than the resulting Unicode
1711 character count */
1712 unicode = _PyUnicode_New(size);
1713 if (!unicode)
1714 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001715 if (size == 0) {
1716 if (consumed)
1717 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
1721 /* Unpack UTF-8 encoded data */
1722 p = unicode->str;
1723 e = s + size;
1724
1725 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001726 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727
1728 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001729 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 s++;
1731 continue;
1732 }
1733
1734 n = utf8_code_length[ch];
1735
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001736 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001737 if (consumed)
1738 break;
1739 else {
1740 errmsg = "unexpected end of data";
1741 startinpos = s-starts;
1742 endinpos = size;
1743 goto utf8Error;
1744 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746
1747 switch (n) {
1748
1749 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001750 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001751 startinpos = s-starts;
1752 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001753 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001756 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001757 startinpos = s-starts;
1758 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760
1761 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001762 if ((s[1] & 0xc0) != 0x80) {
1763 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001764 startinpos = s-starts;
1765 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001766 goto utf8Error;
1767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001769 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001770 startinpos = s-starts;
1771 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001772 errmsg = "illegal encoding";
1773 goto utf8Error;
1774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001776 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 break;
1778
1779 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001780 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001781 (s[2] & 0xc0) != 0x80) {
1782 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783 startinpos = s-starts;
1784 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 goto utf8Error;
1786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001788 if (ch < 0x0800) {
1789 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001790 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001791
1792 XXX For wide builds (UCS-4) we should probably try
1793 to recombine the surrogates into a single code
1794 unit.
1795 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001796 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001797 startinpos = s-starts;
1798 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001799 goto utf8Error;
1800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001802 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001803 break;
1804
1805 case 4:
1806 if ((s[1] & 0xc0) != 0x80 ||
1807 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001808 (s[3] & 0xc0) != 0x80) {
1809 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 startinpos = s-starts;
1811 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001812 goto utf8Error;
1813 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001814 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1815 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1816 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001817 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001818 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001819 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001820 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001821 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001822 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 startinpos = s-starts;
1824 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001825 goto utf8Error;
1826 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001827#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001828 *p++ = (Py_UNICODE)ch;
1829#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001830 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001831
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001832 /* translate from 10000..10FFFF to 0..FFFF */
1833 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001834
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001835 /* high surrogate = top 10 bits added to D800 */
1836 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001837
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001838 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001839 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001840#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
1843 default:
1844 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001845 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001846 startinpos = s-starts;
1847 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 }
1850 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001851 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001852
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 outpos = p-PyUnicode_AS_UNICODE(unicode);
1855 if (unicode_decode_call_errorhandler(
1856 errors, &errorHandler,
1857 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001858 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001859 (PyObject **)&unicode, &outpos, &p))
1860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861 }
Walter Dörwald69652032004-09-07 20:24:22 +00001862 if (consumed)
1863 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864
1865 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001866 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867 goto onError;
1868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 Py_XDECREF(errorHandler);
1870 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 return (PyObject *)unicode;
1872
1873onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001874 Py_XDECREF(errorHandler);
1875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 Py_DECREF(unicode);
1877 return NULL;
1878}
1879
Tim Peters602f7402002-04-27 18:03:26 +00001880/* Allocation strategy: if the string is short, convert into a stack buffer
1881 and allocate exactly as much space needed at the end. Else allocate the
1882 maximum possible needed (4 result bytes per Unicode character), and return
1883 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001884*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001885PyObject *
1886PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001887 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889{
Tim Peters602f7402002-04-27 18:03:26 +00001890#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001891
Martin v. Löwis18e16552006-02-15 17:27:45 +00001892 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001893 PyObject *v; /* result string object */
1894 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001895 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001896 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001897 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001898
Tim Peters602f7402002-04-27 18:03:26 +00001899 assert(s != NULL);
1900 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901
Tim Peters602f7402002-04-27 18:03:26 +00001902 if (size <= MAX_SHORT_UNICHARS) {
1903 /* Write into the stack buffer; nallocated can't overflow.
1904 * At the end, we'll allocate exactly as much heap space as it
1905 * turns out we need.
1906 */
1907 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1908 v = NULL; /* will allocate after we're done */
1909 p = stackbuf;
1910 }
1911 else {
1912 /* Overallocate on the heap, and give the excess back at the end. */
1913 nallocated = size * 4;
1914 if (nallocated / 4 != size) /* overflow! */
1915 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001916 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001917 if (v == NULL)
1918 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001919 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001920 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001921
Tim Peters602f7402002-04-27 18:03:26 +00001922 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001923 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001924
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001925 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001926 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001928
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001930 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001931 *p++ = (char)(0xc0 | (ch >> 6));
1932 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001933 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001934 else {
Tim Peters602f7402002-04-27 18:03:26 +00001935 /* Encode UCS2 Unicode ordinals */
1936 if (ch < 0x10000) {
1937 /* Special case: check for high surrogate */
1938 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1939 Py_UCS4 ch2 = s[i];
1940 /* Check for low surrogate and combine the two to
1941 form a UCS4 value */
1942 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001943 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001944 i++;
1945 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001946 }
Tim Peters602f7402002-04-27 18:03:26 +00001947 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001948 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001949 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001950 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1951 *p++ = (char)(0x80 | (ch & 0x3f));
1952 continue;
1953 }
1954encodeUCS4:
1955 /* Encode UCS4 Unicode ordinals */
1956 *p++ = (char)(0xf0 | (ch >> 18));
1957 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1958 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1959 *p++ = (char)(0x80 | (ch & 0x3f));
1960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001962
Tim Peters602f7402002-04-27 18:03:26 +00001963 if (v == NULL) {
1964 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001965 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001966 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001967 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001968 }
1969 else {
1970 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001971 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001972 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001973 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001976
Tim Peters602f7402002-04-27 18:03:26 +00001977#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978}
1979
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1981{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 if (!PyUnicode_Check(unicode)) {
1983 PyErr_BadArgument();
1984 return NULL;
1985 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001986 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1987 PyUnicode_GET_SIZE(unicode),
1988 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989}
1990
Walter Dörwald41980ca2007-08-16 21:55:45 +00001991/* --- UTF-32 Codec ------------------------------------------------------- */
1992
1993PyObject *
1994PyUnicode_DecodeUTF32(const char *s,
1995 Py_ssize_t size,
1996 const char *errors,
1997 int *byteorder)
1998{
1999 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2000}
2001
2002PyObject *
2003PyUnicode_DecodeUTF32Stateful(const char *s,
2004 Py_ssize_t size,
2005 const char *errors,
2006 int *byteorder,
2007 Py_ssize_t *consumed)
2008{
2009 const char *starts = s;
2010 Py_ssize_t startinpos;
2011 Py_ssize_t endinpos;
2012 Py_ssize_t outpos;
2013 PyUnicodeObject *unicode;
2014 Py_UNICODE *p;
2015#ifndef Py_UNICODE_WIDE
2016 int i, pairs;
2017#else
2018 const int pairs = 0;
2019#endif
2020 const unsigned char *q, *e;
2021 int bo = 0; /* assume native ordering by default */
2022 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002023 /* Offsets from q for retrieving bytes in the right order. */
2024#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2025 int iorder[] = {0, 1, 2, 3};
2026#else
2027 int iorder[] = {3, 2, 1, 0};
2028#endif
2029 PyObject *errorHandler = NULL;
2030 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002031 /* On narrow builds we split characters outside the BMP into two
2032 codepoints => count how much extra space we need. */
2033#ifndef Py_UNICODE_WIDE
2034 for (i = pairs = 0; i < size/4; i++)
2035 if (((Py_UCS4 *)s)[i] >= 0x10000)
2036 pairs++;
2037#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002038
2039 /* This might be one to much, because of a BOM */
2040 unicode = _PyUnicode_New((size+3)/4+pairs);
2041 if (!unicode)
2042 return NULL;
2043 if (size == 0)
2044 return (PyObject *)unicode;
2045
2046 /* Unpack UTF-32 encoded data */
2047 p = unicode->str;
2048 q = (unsigned char *)s;
2049 e = q + size;
2050
2051 if (byteorder)
2052 bo = *byteorder;
2053
2054 /* Check for BOM marks (U+FEFF) in the input and adjust current
2055 byte order setting accordingly. In native mode, the leading BOM
2056 mark is skipped, in all other modes, it is copied to the output
2057 stream as-is (giving a ZWNBSP character). */
2058 if (bo == 0) {
2059 if (size >= 4) {
2060 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2061 (q[iorder[1]] << 8) | q[iorder[0]];
2062#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2063 if (bom == 0x0000FEFF) {
2064 q += 4;
2065 bo = -1;
2066 }
2067 else if (bom == 0xFFFE0000) {
2068 q += 4;
2069 bo = 1;
2070 }
2071#else
2072 if (bom == 0x0000FEFF) {
2073 q += 4;
2074 bo = 1;
2075 }
2076 else if (bom == 0xFFFE0000) {
2077 q += 4;
2078 bo = -1;
2079 }
2080#endif
2081 }
2082 }
2083
2084 if (bo == -1) {
2085 /* force LE */
2086 iorder[0] = 0;
2087 iorder[1] = 1;
2088 iorder[2] = 2;
2089 iorder[3] = 3;
2090 }
2091 else if (bo == 1) {
2092 /* force BE */
2093 iorder[0] = 3;
2094 iorder[1] = 2;
2095 iorder[2] = 1;
2096 iorder[3] = 0;
2097 }
2098
2099 while (q < e) {
2100 Py_UCS4 ch;
2101 /* remaining bytes at the end? (size should be divisible by 4) */
2102 if (e-q<4) {
2103 if (consumed)
2104 break;
2105 errmsg = "truncated data";
2106 startinpos = ((const char *)q)-starts;
2107 endinpos = ((const char *)e)-starts;
2108 goto utf32Error;
2109 /* The remaining input chars are ignored if the callback
2110 chooses to skip the input */
2111 }
2112 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2113 (q[iorder[1]] << 8) | q[iorder[0]];
2114
2115 if (ch >= 0x110000)
2116 {
2117 errmsg = "codepoint not in range(0x110000)";
2118 startinpos = ((const char *)q)-starts;
2119 endinpos = startinpos+4;
2120 goto utf32Error;
2121 }
2122#ifndef Py_UNICODE_WIDE
2123 if (ch >= 0x10000)
2124 {
2125 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2126 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2127 }
2128 else
2129#endif
2130 *p++ = ch;
2131 q += 4;
2132 continue;
2133 utf32Error:
2134 outpos = p-PyUnicode_AS_UNICODE(unicode);
2135 if (unicode_decode_call_errorhandler(
2136 errors, &errorHandler,
2137 "utf32", errmsg,
2138 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2139 (PyObject **)&unicode, &outpos, &p))
2140 goto onError;
2141 }
2142
2143 if (byteorder)
2144 *byteorder = bo;
2145
2146 if (consumed)
2147 *consumed = (const char *)q-starts;
2148
2149 /* Adjust length */
2150 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2151 goto onError;
2152
2153 Py_XDECREF(errorHandler);
2154 Py_XDECREF(exc);
2155 return (PyObject *)unicode;
2156
2157onError:
2158 Py_DECREF(unicode);
2159 Py_XDECREF(errorHandler);
2160 Py_XDECREF(exc);
2161 return NULL;
2162}
2163
2164PyObject *
2165PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2166 Py_ssize_t size,
2167 const char *errors,
2168 int byteorder)
2169{
2170 PyObject *v;
2171 unsigned char *p;
2172#ifndef Py_UNICODE_WIDE
2173 int i, pairs;
2174#else
2175 const int pairs = 0;
2176#endif
2177 /* Offsets from p for storing byte pairs in the right order. */
2178#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2179 int iorder[] = {0, 1, 2, 3};
2180#else
2181 int iorder[] = {3, 2, 1, 0};
2182#endif
2183
2184#define STORECHAR(CH) \
2185 do { \
2186 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2187 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2188 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2189 p[iorder[0]] = (CH) & 0xff; \
2190 p += 4; \
2191 } while(0)
2192
2193 /* In narrow builds we can output surrogate pairs as one codepoint,
2194 so we need less space. */
2195#ifndef Py_UNICODE_WIDE
2196 for (i = pairs = 0; i < size-1; i++)
2197 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2198 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2199 pairs++;
2200#endif
2201 v = PyBytes_FromStringAndSize(NULL,
2202 4 * (size - pairs + (byteorder == 0)));
2203 if (v == NULL)
2204 return NULL;
2205
2206 p = (unsigned char *)PyBytes_AS_STRING(v);
2207 if (byteorder == 0)
2208 STORECHAR(0xFEFF);
2209 if (size == 0)
2210 return v;
2211
2212 if (byteorder == -1) {
2213 /* force LE */
2214 iorder[0] = 0;
2215 iorder[1] = 1;
2216 iorder[2] = 2;
2217 iorder[3] = 3;
2218 }
2219 else if (byteorder == 1) {
2220 /* force BE */
2221 iorder[0] = 3;
2222 iorder[1] = 2;
2223 iorder[2] = 1;
2224 iorder[3] = 0;
2225 }
2226
2227 while (size-- > 0) {
2228 Py_UCS4 ch = *s++;
2229#ifndef Py_UNICODE_WIDE
2230 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2231 Py_UCS4 ch2 = *s;
2232 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2233 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2234 s++;
2235 size--;
2236 }
2237 }
2238#endif
2239 STORECHAR(ch);
2240 }
2241 return v;
2242#undef STORECHAR
2243}
2244
2245PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2246{
2247 if (!PyUnicode_Check(unicode)) {
2248 PyErr_BadArgument();
2249 return NULL;
2250 }
2251 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2252 PyUnicode_GET_SIZE(unicode),
2253 NULL,
2254 0);
2255}
2256
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257/* --- UTF-16 Codec ------------------------------------------------------- */
2258
Tim Peters772747b2001-08-09 22:21:55 +00002259PyObject *
2260PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002261 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002262 const char *errors,
2263 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264{
Walter Dörwald69652032004-09-07 20:24:22 +00002265 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2266}
2267
2268PyObject *
2269PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002270 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002271 const char *errors,
2272 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002273 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002274{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002276 Py_ssize_t startinpos;
2277 Py_ssize_t endinpos;
2278 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 PyUnicodeObject *unicode;
2280 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002281 const unsigned char *q, *e;
2282 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002283 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002284 /* Offsets from q for retrieving byte pairs in the right order. */
2285#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2286 int ihi = 1, ilo = 0;
2287#else
2288 int ihi = 0, ilo = 1;
2289#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 PyObject *errorHandler = NULL;
2291 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 /* Note: size will always be longer than the resulting Unicode
2294 character count */
2295 unicode = _PyUnicode_New(size);
2296 if (!unicode)
2297 return NULL;
2298 if (size == 0)
2299 return (PyObject *)unicode;
2300
2301 /* Unpack UTF-16 encoded data */
2302 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002303 q = (unsigned char *)s;
2304 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305
2306 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002307 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002309 /* Check for BOM marks (U+FEFF) in the input and adjust current
2310 byte order setting accordingly. In native mode, the leading BOM
2311 mark is skipped, in all other modes, it is copied to the output
2312 stream as-is (giving a ZWNBSP character). */
2313 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002314 if (size >= 2) {
2315 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002316#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002317 if (bom == 0xFEFF) {
2318 q += 2;
2319 bo = -1;
2320 }
2321 else if (bom == 0xFFFE) {
2322 q += 2;
2323 bo = 1;
2324 }
Tim Petersced69f82003-09-16 20:30:58 +00002325#else
Walter Dörwald69652032004-09-07 20:24:22 +00002326 if (bom == 0xFEFF) {
2327 q += 2;
2328 bo = 1;
2329 }
2330 else if (bom == 0xFFFE) {
2331 q += 2;
2332 bo = -1;
2333 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002334#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002335 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337
Tim Peters772747b2001-08-09 22:21:55 +00002338 if (bo == -1) {
2339 /* force LE */
2340 ihi = 1;
2341 ilo = 0;
2342 }
2343 else if (bo == 1) {
2344 /* force BE */
2345 ihi = 0;
2346 ilo = 1;
2347 }
2348
2349 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002350 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002351 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002353 if (consumed)
2354 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002355 errmsg = "truncated data";
2356 startinpos = ((const char *)q)-starts;
2357 endinpos = ((const char *)e)-starts;
2358 goto utf16Error;
2359 /* The remaining input chars are ignored if the callback
2360 chooses to skip the input */
2361 }
2362 ch = (q[ihi] << 8) | q[ilo];
2363
Tim Peters772747b2001-08-09 22:21:55 +00002364 q += 2;
2365
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 if (ch < 0xD800 || ch > 0xDFFF) {
2367 *p++ = ch;
2368 continue;
2369 }
2370
2371 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002372 if (q >= e) {
2373 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002374 startinpos = (((const char *)q)-2)-starts;
2375 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002376 goto utf16Error;
2377 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002378 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002379 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2380 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002381 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002382#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002383 *p++ = ch;
2384 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002385#else
2386 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002387#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002388 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002389 }
2390 else {
2391 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002392 startinpos = (((const char *)q)-4)-starts;
2393 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002394 goto utf16Error;
2395 }
2396
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002398 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002399 startinpos = (((const char *)q)-2)-starts;
2400 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002401 /* Fall through to report the error */
2402
2403 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002404 outpos = p-PyUnicode_AS_UNICODE(unicode);
2405 if (unicode_decode_call_errorhandler(
2406 errors, &errorHandler,
2407 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002408 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002409 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002410 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411 }
2412
2413 if (byteorder)
2414 *byteorder = bo;
2415
Walter Dörwald69652032004-09-07 20:24:22 +00002416 if (consumed)
2417 *consumed = (const char *)q-starts;
2418
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002420 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 goto onError;
2422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002423 Py_XDECREF(errorHandler);
2424 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 return (PyObject *)unicode;
2426
2427onError:
2428 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002429 Py_XDECREF(errorHandler);
2430 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 return NULL;
2432}
2433
Tim Peters772747b2001-08-09 22:21:55 +00002434PyObject *
2435PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002436 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002437 const char *errors,
2438 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439{
2440 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002441 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002442#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002443 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002444#else
2445 const int pairs = 0;
2446#endif
Tim Peters772747b2001-08-09 22:21:55 +00002447 /* Offsets from p for storing byte pairs in the right order. */
2448#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2449 int ihi = 1, ilo = 0;
2450#else
2451 int ihi = 0, ilo = 1;
2452#endif
2453
2454#define STORECHAR(CH) \
2455 do { \
2456 p[ihi] = ((CH) >> 8) & 0xff; \
2457 p[ilo] = (CH) & 0xff; \
2458 p += 2; \
2459 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002461#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002462 for (i = pairs = 0; i < size; i++)
2463 if (s[i] >= 0x10000)
2464 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002465#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002466 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002467 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 if (v == NULL)
2469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470
Walter Dörwald3cc34522007-05-04 10:48:27 +00002471 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002473 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002474 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002475 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002476
2477 if (byteorder == -1) {
2478 /* force LE */
2479 ihi = 1;
2480 ilo = 0;
2481 }
2482 else if (byteorder == 1) {
2483 /* force BE */
2484 ihi = 0;
2485 ilo = 1;
2486 }
2487
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002488 while (size-- > 0) {
2489 Py_UNICODE ch = *s++;
2490 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002491#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002492 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002493 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2494 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002496#endif
Tim Peters772747b2001-08-09 22:21:55 +00002497 STORECHAR(ch);
2498 if (ch2)
2499 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002502#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503}
2504
2505PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2506{
2507 if (!PyUnicode_Check(unicode)) {
2508 PyErr_BadArgument();
2509 return NULL;
2510 }
2511 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2512 PyUnicode_GET_SIZE(unicode),
2513 NULL,
2514 0);
2515}
2516
2517/* --- Unicode Escape Codec ----------------------------------------------- */
2518
Fredrik Lundh06d12682001-01-24 07:59:11 +00002519static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002520
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002522 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 const char *errors)
2524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002526 Py_ssize_t startinpos;
2527 Py_ssize_t endinpos;
2528 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002533 char* message;
2534 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 PyObject *errorHandler = NULL;
2536 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002537
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538 /* Escaped strings will always be longer than the resulting
2539 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540 length after conversion to the true value.
2541 (but if the error callback returns a long replacement string
2542 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 v = _PyUnicode_New(size);
2544 if (v == NULL)
2545 goto onError;
2546 if (size == 0)
2547 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 while (s < end) {
2553 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002554 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
2557 /* Non-escape characters are interpreted as Unicode ordinals */
2558 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002559 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 continue;
2561 }
2562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 /* \ - Escapes */
2565 s++;
2566 switch (*s++) {
2567
2568 /* \x escapes */
2569 case '\n': break;
2570 case '\\': *p++ = '\\'; break;
2571 case '\'': *p++ = '\''; break;
2572 case '\"': *p++ = '\"'; break;
2573 case 'b': *p++ = '\b'; break;
2574 case 'f': *p++ = '\014'; break; /* FF */
2575 case 't': *p++ = '\t'; break;
2576 case 'n': *p++ = '\n'; break;
2577 case 'r': *p++ = '\r'; break;
2578 case 'v': *p++ = '\013'; break; /* VT */
2579 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2580
2581 /* \OOO (octal) escapes */
2582 case '0': case '1': case '2': case '3':
2583 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002584 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002586 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002588 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002590 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 break;
2592
Fredrik Lundhccc74732001-02-18 22:13:49 +00002593 /* hex escapes */
2594 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002596 digits = 2;
2597 message = "truncated \\xXX escape";
2598 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599
Fredrik Lundhccc74732001-02-18 22:13:49 +00002600 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002602 digits = 4;
2603 message = "truncated \\uXXXX escape";
2604 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
Fredrik Lundhccc74732001-02-18 22:13:49 +00002606 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002607 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002608 digits = 8;
2609 message = "truncated \\UXXXXXXXX escape";
2610 hexescape:
2611 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 outpos = p-PyUnicode_AS_UNICODE(v);
2613 if (s+digits>end) {
2614 endinpos = size;
2615 if (unicode_decode_call_errorhandler(
2616 errors, &errorHandler,
2617 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002618 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 (PyObject **)&v, &outpos, &p))
2620 goto onError;
2621 goto nextByte;
2622 }
2623 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002624 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002625 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 endinpos = (s+i+1)-starts;
2627 if (unicode_decode_call_errorhandler(
2628 errors, &errorHandler,
2629 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002630 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002632 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002634 }
2635 chr = (chr<<4) & ~0xF;
2636 if (c >= '0' && c <= '9')
2637 chr += c - '0';
2638 else if (c >= 'a' && c <= 'f')
2639 chr += 10 + c - 'a';
2640 else
2641 chr += 10 + c - 'A';
2642 }
2643 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002644 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002645 /* _decoding_error will have already written into the
2646 target buffer. */
2647 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002648 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002649 /* when we get here, chr is a 32-bit unicode character */
2650 if (chr <= 0xffff)
2651 /* UCS-2 character */
2652 *p++ = (Py_UNICODE) chr;
2653 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002654 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002655 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002656#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002657 *p++ = chr;
2658#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002659 chr -= 0x10000L;
2660 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002661 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002662#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002663 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002664 endinpos = s-starts;
2665 outpos = p-PyUnicode_AS_UNICODE(v);
2666 if (unicode_decode_call_errorhandler(
2667 errors, &errorHandler,
2668 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002669 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002671 goto onError;
2672 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002673 break;
2674
2675 /* \N{name} */
2676 case 'N':
2677 message = "malformed \\N character escape";
2678 if (ucnhash_CAPI == NULL) {
2679 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002680 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002681 m = PyImport_ImportModule("unicodedata");
2682 if (m == NULL)
2683 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002684 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002685 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002686 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002687 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002688 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002689 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002690 if (ucnhash_CAPI == NULL)
2691 goto ucnhashError;
2692 }
2693 if (*s == '{') {
2694 const char *start = s+1;
2695 /* look for the closing brace */
2696 while (*s != '}' && s < end)
2697 s++;
2698 if (s > start && s < end && *s == '}') {
2699 /* found a name. look it up in the unicode database */
2700 message = "unknown Unicode character name";
2701 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002702 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002703 goto store;
2704 }
2705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 endinpos = s-starts;
2707 outpos = p-PyUnicode_AS_UNICODE(v);
2708 if (unicode_decode_call_errorhandler(
2709 errors, &errorHandler,
2710 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002711 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002713 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002714 break;
2715
2716 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002717 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 message = "\\ at end of string";
2719 s--;
2720 endinpos = s-starts;
2721 outpos = p-PyUnicode_AS_UNICODE(v);
2722 if (unicode_decode_call_errorhandler(
2723 errors, &errorHandler,
2724 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002725 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002727 goto onError;
2728 }
2729 else {
2730 *p++ = '\\';
2731 *p++ = (unsigned char)s[-1];
2732 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002733 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 nextByte:
2736 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002738 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002740 Py_XDECREF(errorHandler);
2741 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002743
Fredrik Lundhccc74732001-02-18 22:13:49 +00002744ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002745 PyErr_SetString(
2746 PyExc_UnicodeError,
2747 "\\N escapes not supported (can't load unicodedata module)"
2748 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002749 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 Py_XDECREF(errorHandler);
2751 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002752 return NULL;
2753
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 Py_XDECREF(errorHandler);
2757 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 return NULL;
2759}
2760
/* Return a Unicode-Escape string version of the Unicode object.

   Note: the encoder in this file, PyUnicode_EncodeUnicodeEscape(s, size),
   takes no "quotes" argument and does not enclose its output in u"" or
   u'' quotes.

*/
2767
Thomas Wouters477c8d52006-05-27 19:21:47 +00002768Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2769 Py_ssize_t size,
2770 Py_UNICODE ch)
2771{
2772 /* like wcschr, but doesn't stop at NULL characters */
2773
2774 while (size-- > 0) {
2775 if (*s == ch)
2776 return s;
2777 s++;
2778 }
2779
2780 return NULL;
2781}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002782
Walter Dörwald79e913e2007-05-12 11:08:06 +00002783static const char *hexdigits = "0123456789abcdef";
2784
2785PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2786 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787{
2788 PyObject *repr;
2789 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790
Thomas Wouters89f507f2006-12-13 04:49:30 +00002791 /* XXX(nnorwitz): rather than over-allocating, it would be
2792 better to choose a different scheme. Perhaps scan the
2793 first N-chars of the string and allocate based on that size.
2794 */
2795 /* Initial allocation is based on the longest-possible unichr
2796 escape.
2797
2798 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2799 unichr, so in this case it's the longest unichr escape. In
2800 narrow (UTF-16) builds this is five chars per source unichr
2801 since there are two unichrs in the surrogate pair, so in narrow
2802 (UTF-16) builds it's not the longest unichr escape.
2803
2804 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2805 so in the narrow (UTF-16) build case it's the longest unichr
2806 escape.
2807 */
2808
Walter Dörwald79e913e2007-05-12 11:08:06 +00002809 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002810#ifdef Py_UNICODE_WIDE
2811 + 10*size
2812#else
2813 + 6*size
2814#endif
2815 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 if (repr == NULL)
2817 return NULL;
2818
Walter Dörwald79e913e2007-05-12 11:08:06 +00002819 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 while (size-- > 0) {
2822 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002823
Walter Dörwald79e913e2007-05-12 11:08:06 +00002824 /* Escape backslashes */
2825 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 *p++ = '\\';
2827 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002828 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002829 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002830
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002831#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002832 /* Map 21-bit characters to '\U00xxxxxx' */
2833 else if (ch >= 0x10000) {
2834 *p++ = '\\';
2835 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002836 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2837 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2838 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2839 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2840 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2841 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2842 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2843 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002844 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002845 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002846#else
2847 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002848 else if (ch >= 0xD800 && ch < 0xDC00) {
2849 Py_UNICODE ch2;
2850 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002851
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002852 ch2 = *s++;
2853 size--;
2854 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2855 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2856 *p++ = '\\';
2857 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002858 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2859 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2860 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2861 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2862 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2863 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2864 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2865 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002866 continue;
2867 }
2868 /* Fall through: isolated surrogates are copied as-is */
2869 s--;
2870 size++;
2871 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002872#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002873
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002875 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876 *p++ = '\\';
2877 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002878 *p++ = hexdigits[(ch >> 12) & 0x000F];
2879 *p++ = hexdigits[(ch >> 8) & 0x000F];
2880 *p++ = hexdigits[(ch >> 4) & 0x000F];
2881 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002883
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002884 /* Map special whitespace to '\t', \n', '\r' */
2885 else if (ch == '\t') {
2886 *p++ = '\\';
2887 *p++ = 't';
2888 }
2889 else if (ch == '\n') {
2890 *p++ = '\\';
2891 *p++ = 'n';
2892 }
2893 else if (ch == '\r') {
2894 *p++ = '\\';
2895 *p++ = 'r';
2896 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002897
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002898 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002899 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002901 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002902 *p++ = hexdigits[(ch >> 4) & 0x000F];
2903 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002904 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002905
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 /* Copy everything else as-is */
2907 else
2908 *p++ = (char) ch;
2909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910
2911 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002912 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2913 Py_DECREF(repr);
2914 return NULL;
2915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 return repr;
2917}
2918
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2920{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002921 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 if (!PyUnicode_Check(unicode)) {
2923 PyErr_BadArgument();
2924 return NULL;
2925 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002926 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2927 PyUnicode_GET_SIZE(unicode));
2928
2929 if (!s)
2930 return NULL;
2931 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2932 PyBytes_GET_SIZE(s));
2933 Py_DECREF(s);
2934 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935}
2936
2937/* --- Raw Unicode Escape Codec ------------------------------------------- */
2938
2939PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002940 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 const char *errors)
2942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002944 Py_ssize_t startinpos;
2945 Py_ssize_t endinpos;
2946 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 const char *end;
2950 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 PyObject *errorHandler = NULL;
2952 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002953
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 /* Escaped strings will always be longer than the resulting
2955 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956 length after conversion to the true value. (But decoding error
2957 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 v = _PyUnicode_New(size);
2959 if (v == NULL)
2960 goto onError;
2961 if (size == 0)
2962 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 end = s + size;
2965 while (s < end) {
2966 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002967 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002969 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970
2971 /* Non-escape characters are interpreted as Unicode ordinals */
2972 if (*s != '\\') {
2973 *p++ = (unsigned char)*s++;
2974 continue;
2975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002976 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977
2978 /* \u-escapes are only interpreted iff the number of leading
2979 backslashes if odd */
2980 bs = s;
2981 for (;s < end;) {
2982 if (*s != '\\')
2983 break;
2984 *p++ = (unsigned char)*s++;
2985 }
2986 if (((s - bs) & 1) == 0 ||
2987 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002988 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 continue;
2990 }
2991 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002992 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 s++;
2994
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002995 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002997 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 endinpos = s-starts;
3001 if (unicode_decode_call_errorhandler(
3002 errors, &errorHandler,
3003 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003004 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003007 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 }
3009 x = (x<<4) & ~0xF;
3010 if (c >= '0' && c <= '9')
3011 x += c - '0';
3012 else if (c >= 'a' && c <= 'f')
3013 x += 10 + c - 'a';
3014 else
3015 x += 10 + c - 'A';
3016 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003017#ifndef Py_UNICODE_WIDE
3018 if (x > 0x10000) {
3019 if (unicode_decode_call_errorhandler(
3020 errors, &errorHandler,
3021 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003022 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003023 (PyObject **)&v, &outpos, &p))
3024 goto onError;
3025 }
3026#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003027 *p++ = x;
3028 nextByte:
3029 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003031 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003032 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 Py_XDECREF(errorHandler);
3034 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003036
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 onError:
3038 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003039 Py_XDECREF(errorHandler);
3040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 return NULL;
3042}
3043
3044PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003045 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046{
3047 PyObject *repr;
3048 char *p;
3049 char *q;
3050
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003051#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003052 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003053#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003054 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003055#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 if (repr == NULL)
3057 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003058 if (size == 0)
3059 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060
Walter Dörwald711005d2007-05-12 12:03:26 +00003061 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 while (size-- > 0) {
3063 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003064#ifdef Py_UNICODE_WIDE
3065 /* Map 32-bit characters to '\Uxxxxxxxx' */
3066 if (ch >= 0x10000) {
3067 *p++ = '\\';
3068 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003069 *p++ = hexdigits[(ch >> 28) & 0xf];
3070 *p++ = hexdigits[(ch >> 24) & 0xf];
3071 *p++ = hexdigits[(ch >> 20) & 0xf];
3072 *p++ = hexdigits[(ch >> 16) & 0xf];
3073 *p++ = hexdigits[(ch >> 12) & 0xf];
3074 *p++ = hexdigits[(ch >> 8) & 0xf];
3075 *p++ = hexdigits[(ch >> 4) & 0xf];
3076 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003077 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003078 else
3079#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 /* Map 16-bit characters to '\uxxxx' */
3081 if (ch >= 256) {
3082 *p++ = '\\';
3083 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003084 *p++ = hexdigits[(ch >> 12) & 0xf];
3085 *p++ = hexdigits[(ch >> 8) & 0xf];
3086 *p++ = hexdigits[(ch >> 4) & 0xf];
3087 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 }
3089 /* Copy everything else as-is */
3090 else
3091 *p++ = (char) ch;
3092 }
3093 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003094 if (PyBytes_Resize(repr, p - q)) {
3095 Py_DECREF(repr);
3096 return NULL;
3097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 return repr;
3099}
3100
3101PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3102{
Walter Dörwald711005d2007-05-12 12:03:26 +00003103 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003105 PyErr_BadArgument();
3106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003108 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3109 PyUnicode_GET_SIZE(unicode));
3110
3111 if (!s)
3112 return NULL;
3113 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3114 PyBytes_GET_SIZE(s));
3115 Py_DECREF(s);
3116 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117}
3118
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003119/* --- Unicode Internal Codec ------------------------------------------- */
3120
3121PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003122 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003123 const char *errors)
3124{
3125 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t startinpos;
3127 Py_ssize_t endinpos;
3128 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003129 PyUnicodeObject *v;
3130 Py_UNICODE *p;
3131 const char *end;
3132 const char *reason;
3133 PyObject *errorHandler = NULL;
3134 PyObject *exc = NULL;
3135
Neal Norwitzd43069c2006-01-08 01:12:10 +00003136#ifdef Py_UNICODE_WIDE
3137 Py_UNICODE unimax = PyUnicode_GetMax();
3138#endif
3139
Thomas Wouters89f507f2006-12-13 04:49:30 +00003140 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003141 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3142 if (v == NULL)
3143 goto onError;
3144 if (PyUnicode_GetSize((PyObject *)v) == 0)
3145 return (PyObject *)v;
3146 p = PyUnicode_AS_UNICODE(v);
3147 end = s + size;
3148
3149 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003150 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003151 /* We have to sanity check the raw data, otherwise doom looms for
3152 some malformed UCS-4 data. */
3153 if (
3154 #ifdef Py_UNICODE_WIDE
3155 *p > unimax || *p < 0 ||
3156 #endif
3157 end-s < Py_UNICODE_SIZE
3158 )
3159 {
3160 startinpos = s - starts;
3161 if (end-s < Py_UNICODE_SIZE) {
3162 endinpos = end-starts;
3163 reason = "truncated input";
3164 }
3165 else {
3166 endinpos = s - starts + Py_UNICODE_SIZE;
3167 reason = "illegal code point (> 0x10FFFF)";
3168 }
3169 outpos = p - PyUnicode_AS_UNICODE(v);
3170 if (unicode_decode_call_errorhandler(
3171 errors, &errorHandler,
3172 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003173 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003174 (PyObject **)&v, &outpos, &p)) {
3175 goto onError;
3176 }
3177 }
3178 else {
3179 p++;
3180 s += Py_UNICODE_SIZE;
3181 }
3182 }
3183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003184 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003185 goto onError;
3186 Py_XDECREF(errorHandler);
3187 Py_XDECREF(exc);
3188 return (PyObject *)v;
3189
3190 onError:
3191 Py_XDECREF(v);
3192 Py_XDECREF(errorHandler);
3193 Py_XDECREF(exc);
3194 return NULL;
3195}
3196
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197/* --- Latin-1 Codec ------------------------------------------------------ */
3198
3199PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003200 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 const char *errors)
3202{
3203 PyUnicodeObject *v;
3204 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003205
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003207 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003208 Py_UNICODE r = *(unsigned char*)s;
3209 return PyUnicode_FromUnicode(&r, 1);
3210 }
3211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 v = _PyUnicode_New(size);
3213 if (v == NULL)
3214 goto onError;
3215 if (size == 0)
3216 return (PyObject *)v;
3217 p = PyUnicode_AS_UNICODE(v);
3218 while (size-- > 0)
3219 *p++ = (unsigned char)*s++;
3220 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 onError:
3223 Py_XDECREF(v);
3224 return NULL;
3225}
3226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227/* create or adjust a UnicodeEncodeError */
3228static void make_encode_exception(PyObject **exceptionObject,
3229 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003230 const Py_UNICODE *unicode, Py_ssize_t size,
3231 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 if (*exceptionObject == NULL) {
3235 *exceptionObject = PyUnicodeEncodeError_Create(
3236 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
3238 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3240 goto onError;
3241 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3242 goto onError;
3243 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3244 goto onError;
3245 return;
3246 onError:
3247 Py_DECREF(*exceptionObject);
3248 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 }
3250}
3251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252/* raises a UnicodeEncodeError */
3253static void raise_encode_exception(PyObject **exceptionObject,
3254 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003255 const Py_UNICODE *unicode, Py_ssize_t size,
3256 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 const char *reason)
3258{
3259 make_encode_exception(exceptionObject,
3260 encoding, unicode, size, startpos, endpos, reason);
3261 if (*exceptionObject != NULL)
3262 PyCodec_StrictErrors(*exceptionObject);
3263}
3264
3265/* error handling callback helper:
3266 build arguments, call the callback and check the arguments,
3267 put the result into newpos and return the replacement string, which
3268 has to be freed by the caller */
3269static PyObject *unicode_encode_call_errorhandler(const char *errors,
3270 PyObject **errorHandler,
3271 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003272 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3273 Py_ssize_t startpos, Py_ssize_t endpos,
3274 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003276 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277
3278 PyObject *restuple;
3279 PyObject *resunicode;
3280
3281 if (*errorHandler == NULL) {
3282 *errorHandler = PyCodec_LookupError(errors);
3283 if (*errorHandler == NULL)
3284 return NULL;
3285 }
3286
3287 make_encode_exception(exceptionObject,
3288 encoding, unicode, size, startpos, endpos, reason);
3289 if (*exceptionObject == NULL)
3290 return NULL;
3291
3292 restuple = PyObject_CallFunctionObjArgs(
3293 *errorHandler, *exceptionObject, NULL);
3294 if (restuple == NULL)
3295 return NULL;
3296 if (!PyTuple_Check(restuple)) {
3297 PyErr_Format(PyExc_TypeError, &argparse[4]);
3298 Py_DECREF(restuple);
3299 return NULL;
3300 }
3301 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3302 &resunicode, newpos)) {
3303 Py_DECREF(restuple);
3304 return NULL;
3305 }
3306 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003307 *newpos = size+*newpos;
3308 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003309 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003310 Py_DECREF(restuple);
3311 return NULL;
3312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 Py_INCREF(resunicode);
3314 Py_DECREF(restuple);
3315 return resunicode;
3316}
3317
3318static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003319 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 const char *errors,
3321 int limit)
3322{
3323 /* output object */
3324 PyObject *res;
3325 /* pointers to the beginning and end+1 of input */
3326 const Py_UNICODE *startp = p;
3327 const Py_UNICODE *endp = p + size;
3328 /* pointer to the beginning of the unencodable characters */
3329 /* const Py_UNICODE *badp = NULL; */
3330 /* pointer into the output */
3331 char *str;
3332 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 Py_ssize_t respos = 0;
3334 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003335 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3336 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 PyObject *errorHandler = NULL;
3338 PyObject *exc = NULL;
3339 /* the following variable is used for caching string comparisons
3340 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3341 int known_errorHandler = -1;
3342
3343 /* allocate enough for a simple encoding without
3344 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003345 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 if (res == NULL)
3347 goto onError;
3348 if (size == 0)
3349 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003350 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003351 ressize = size;
3352
3353 while (p<endp) {
3354 Py_UNICODE c = *p;
3355
3356 /* can we encode this? */
3357 if (c<limit) {
3358 /* no overflow check, because we know that the space is enough */
3359 *str++ = (char)c;
3360 ++p;
3361 }
3362 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003363 Py_ssize_t unicodepos = p-startp;
3364 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003366 Py_ssize_t repsize;
3367 Py_ssize_t newpos;
3368 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 Py_UNICODE *uni2;
3370 /* startpos for collecting unencodable chars */
3371 const Py_UNICODE *collstart = p;
3372 const Py_UNICODE *collend = p;
3373 /* find all unecodable characters */
3374 while ((collend < endp) && ((*collend)>=limit))
3375 ++collend;
3376 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3377 if (known_errorHandler==-1) {
3378 if ((errors==NULL) || (!strcmp(errors, "strict")))
3379 known_errorHandler = 1;
3380 else if (!strcmp(errors, "replace"))
3381 known_errorHandler = 2;
3382 else if (!strcmp(errors, "ignore"))
3383 known_errorHandler = 3;
3384 else if (!strcmp(errors, "xmlcharrefreplace"))
3385 known_errorHandler = 4;
3386 else
3387 known_errorHandler = 0;
3388 }
3389 switch (known_errorHandler) {
3390 case 1: /* strict */
3391 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3392 goto onError;
3393 case 2: /* replace */
3394 while (collstart++<collend)
3395 *str++ = '?'; /* fall through */
3396 case 3: /* ignore */
3397 p = collend;
3398 break;
3399 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003400 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 /* determine replacement size (temporarily (mis)uses p) */
3402 for (p = collstart, repsize = 0; p < collend; ++p) {
3403 if (*p<10)
3404 repsize += 2+1+1;
3405 else if (*p<100)
3406 repsize += 2+2+1;
3407 else if (*p<1000)
3408 repsize += 2+3+1;
3409 else if (*p<10000)
3410 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003411#ifndef Py_UNICODE_WIDE
3412 else
3413 repsize += 2+5+1;
3414#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 else if (*p<100000)
3416 repsize += 2+5+1;
3417 else if (*p<1000000)
3418 repsize += 2+6+1;
3419 else
3420 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003421#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 }
3423 requiredsize = respos+repsize+(endp-collend);
3424 if (requiredsize > ressize) {
3425 if (requiredsize<2*ressize)
3426 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003427 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003429 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 ressize = requiredsize;
3431 }
3432 /* generate replacement (temporarily (mis)uses p) */
3433 for (p = collstart; p < collend; ++p) {
3434 str += sprintf(str, "&#%d;", (int)*p);
3435 }
3436 p = collend;
3437 break;
3438 default:
3439 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3440 encoding, reason, startp, size, &exc,
3441 collstart-startp, collend-startp, &newpos);
3442 if (repunicode == NULL)
3443 goto onError;
3444 /* need more space? (at least enough for what we
3445 have+the replacement+the rest of the string, so
3446 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003447 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 repsize = PyUnicode_GET_SIZE(repunicode);
3449 requiredsize = respos+repsize+(endp-collend);
3450 if (requiredsize > ressize) {
3451 if (requiredsize<2*ressize)
3452 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003453 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 Py_DECREF(repunicode);
3455 goto onError;
3456 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003457 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458 ressize = requiredsize;
3459 }
3460 /* check if there is anything unencodable in the replacement
3461 and copy it to the output */
3462 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3463 c = *uni2;
3464 if (c >= limit) {
3465 raise_encode_exception(&exc, encoding, startp, size,
3466 unicodepos, unicodepos+1, reason);
3467 Py_DECREF(repunicode);
3468 goto onError;
3469 }
3470 *str = (char)c;
3471 }
3472 p = startp + newpos;
3473 Py_DECREF(repunicode);
3474 }
3475 }
3476 }
3477 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003478 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 if (respos<ressize)
3480 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003481 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 Py_XDECREF(errorHandler);
3483 Py_XDECREF(exc);
3484 return res;
3485
3486 onError:
3487 Py_XDECREF(res);
3488 Py_XDECREF(errorHandler);
3489 Py_XDECREF(exc);
3490 return NULL;
3491}
3492
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003494 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 const char *errors)
3496{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498}
3499
3500PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3501{
3502 if (!PyUnicode_Check(unicode)) {
3503 PyErr_BadArgument();
3504 return NULL;
3505 }
3506 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3507 PyUnicode_GET_SIZE(unicode),
3508 NULL);
3509}
3510
3511/* --- 7-bit ASCII Codec -------------------------------------------------- */
3512
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003514 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 const char *errors)
3516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 PyUnicodeObject *v;
3519 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003520 Py_ssize_t startinpos;
3521 Py_ssize_t endinpos;
3522 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 const char *e;
3524 PyObject *errorHandler = NULL;
3525 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003526
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003528 if (size == 1 && *(unsigned char*)s < 128) {
3529 Py_UNICODE r = *(unsigned char*)s;
3530 return PyUnicode_FromUnicode(&r, 1);
3531 }
Tim Petersced69f82003-09-16 20:30:58 +00003532
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 v = _PyUnicode_New(size);
3534 if (v == NULL)
3535 goto onError;
3536 if (size == 0)
3537 return (PyObject *)v;
3538 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 e = s + size;
3540 while (s < e) {
3541 register unsigned char c = (unsigned char)*s;
3542 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 ++s;
3545 }
3546 else {
3547 startinpos = s-starts;
3548 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003549 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 if (unicode_decode_call_errorhandler(
3551 errors, &errorHandler,
3552 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003553 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003558 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003559 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003560 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 Py_XDECREF(errorHandler);
3562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003564
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 onError:
3566 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 Py_XDECREF(errorHandler);
3568 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 return NULL;
3570}
3571
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003573 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 const char *errors)
3575{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577}
3578
3579PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3580{
3581 if (!PyUnicode_Check(unicode)) {
3582 PyErr_BadArgument();
3583 return NULL;
3584 }
3585 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3586 PyUnicode_GET_SIZE(unicode),
3587 NULL);
3588}
3589
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003591
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003592/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003593
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003594#if SIZEOF_INT < SIZEOF_SSIZE_T
3595#define NEED_RETRY
3596#endif
3597
3598/* XXX This code is limited to "true" double-byte encodings, as
3599 a) it assumes an incomplete character consists of a single byte, and
3600 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3601 encodings, see IsDBCSLeadByteEx documentation. */
3602
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts counts when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3613
3614/*
3615 * Decode MBCS string into unicode object. If 'final' is set, converts
3616 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3617 */
3618static int decode_mbcs(PyUnicodeObject **v,
3619 const char *s, /* MBCS string */
3620 int size, /* sizeof MBCS string */
3621 int final)
3622{
3623 Py_UNICODE *p;
3624 Py_ssize_t n = 0;
3625 int usize = 0;
3626
3627 assert(size >= 0);
3628
3629 /* Skip trailing lead-byte unless 'final' is set */
3630 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3631 --size;
3632
3633 /* First get the size of the result */
3634 if (size > 0) {
3635 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3636 if (usize == 0) {
3637 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3638 return -1;
3639 }
3640 }
3641
3642 if (*v == NULL) {
3643 /* Create unicode object */
3644 *v = _PyUnicode_New(usize);
3645 if (*v == NULL)
3646 return -1;
3647 }
3648 else {
3649 /* Extend unicode object */
3650 n = PyUnicode_GET_SIZE(*v);
3651 if (_PyUnicode_Resize(v, n + usize) < 0)
3652 return -1;
3653 }
3654
3655 /* Do the conversion */
3656 if (size > 0) {
3657 p = PyUnicode_AS_UNICODE(*v) + n;
3658 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3659 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3660 return -1;
3661 }
3662 }
3663
3664 return size;
3665}
3666
3667PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3668 Py_ssize_t size,
3669 const char *errors,
3670 Py_ssize_t *consumed)
3671{
3672 PyUnicodeObject *v = NULL;
3673 int done;
3674
3675 if (consumed)
3676 *consumed = 0;
3677
3678#ifdef NEED_RETRY
3679 retry:
3680 if (size > INT_MAX)
3681 done = decode_mbcs(&v, s, INT_MAX, 0);
3682 else
3683#endif
3684 done = decode_mbcs(&v, s, (int)size, !consumed);
3685
3686 if (done < 0) {
3687 Py_XDECREF(v);
3688 return NULL;
3689 }
3690
3691 if (consumed)
3692 *consumed += done;
3693
3694#ifdef NEED_RETRY
3695 if (size > INT_MAX) {
3696 s += done;
3697 size -= done;
3698 goto retry;
3699 }
3700#endif
3701
3702 return (PyObject *)v;
3703}
3704
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003705PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003706 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003707 const char *errors)
3708{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003709 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3710}
3711
3712/*
3713 * Convert unicode into string object (MBCS).
3714 * Returns 0 if succeed, -1 otherwise.
3715 */
3716static int encode_mbcs(PyObject **repr,
3717 const Py_UNICODE *p, /* unicode */
3718 int size) /* size of unicode */
3719{
3720 int mbcssize = 0;
3721 Py_ssize_t n = 0;
3722
3723 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003724
3725 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003726 if (size > 0) {
3727 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3728 if (mbcssize == 0) {
3729 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3730 return -1;
3731 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003732 }
3733
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003734 if (*repr == NULL) {
3735 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003736 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003737 if (*repr == NULL)
3738 return -1;
3739 }
3740 else {
3741 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003742 n = PyBytes_Size(*repr);
3743 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003744 return -1;
3745 }
3746
3747 /* Do the conversion */
3748 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003749 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003750 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3751 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3752 return -1;
3753 }
3754 }
3755
3756 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003757}
3758
3759PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003760 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003761 const char *errors)
3762{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003763 PyObject *repr = NULL;
3764 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003765
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003766#ifdef NEED_RETRY
3767 retry:
3768 if (size > INT_MAX)
3769 ret = encode_mbcs(&repr, p, INT_MAX);
3770 else
3771#endif
3772 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003773
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003774 if (ret < 0) {
3775 Py_XDECREF(repr);
3776 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003777 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003778
3779#ifdef NEED_RETRY
3780 if (size > INT_MAX) {
3781 p += INT_MAX;
3782 size -= INT_MAX;
3783 goto retry;
3784 }
3785#endif
3786
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003787 return repr;
3788}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003789
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003790PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3791{
3792 if (!PyUnicode_Check(unicode)) {
3793 PyErr_BadArgument();
3794 return NULL;
3795 }
3796 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3797 PyUnicode_GET_SIZE(unicode),
3798 NULL);
3799}
3800
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003801#undef NEED_RETRY
3802
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003803#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805/* --- Character Mapping Codec -------------------------------------------- */
3806
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003808 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 PyObject *mapping,
3810 const char *errors)
3811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003813 Py_ssize_t startinpos;
3814 Py_ssize_t endinpos;
3815 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 PyUnicodeObject *v;
3818 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003819 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 PyObject *errorHandler = NULL;
3821 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003822 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003823 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003824
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 /* Default to Latin-1 */
3826 if (mapping == NULL)
3827 return PyUnicode_DecodeLatin1(s, size, errors);
3828
3829 v = _PyUnicode_New(size);
3830 if (v == NULL)
3831 goto onError;
3832 if (size == 0)
3833 return (PyObject *)v;
3834 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003836 if (PyUnicode_CheckExact(mapping)) {
3837 mapstring = PyUnicode_AS_UNICODE(mapping);
3838 maplen = PyUnicode_GET_SIZE(mapping);
3839 while (s < e) {
3840 unsigned char ch = *s;
3841 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003843 if (ch < maplen)
3844 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003846 if (x == 0xfffe) {
3847 /* undefined mapping */
3848 outpos = p-PyUnicode_AS_UNICODE(v);
3849 startinpos = s-starts;
3850 endinpos = startinpos+1;
3851 if (unicode_decode_call_errorhandler(
3852 errors, &errorHandler,
3853 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003854 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003855 (PyObject **)&v, &outpos, &p)) {
3856 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003857 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003858 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003859 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003860 *p++ = x;
3861 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003863 }
3864 else {
3865 while (s < e) {
3866 unsigned char ch = *s;
3867 PyObject *w, *x;
3868
3869 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3870 w = PyInt_FromLong((long)ch);
3871 if (w == NULL)
3872 goto onError;
3873 x = PyObject_GetItem(mapping, w);
3874 Py_DECREF(w);
3875 if (x == NULL) {
3876 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3877 /* No mapping found means: mapping is undefined. */
3878 PyErr_Clear();
3879 x = Py_None;
3880 Py_INCREF(x);
3881 } else
3882 goto onError;
3883 }
3884
3885 /* Apply mapping */
3886 if (PyInt_Check(x)) {
3887 long value = PyInt_AS_LONG(x);
3888 if (value < 0 || value > 65535) {
3889 PyErr_SetString(PyExc_TypeError,
3890 "character mapping must be in range(65536)");
3891 Py_DECREF(x);
3892 goto onError;
3893 }
3894 *p++ = (Py_UNICODE)value;
3895 }
3896 else if (x == Py_None) {
3897 /* undefined mapping */
3898 outpos = p-PyUnicode_AS_UNICODE(v);
3899 startinpos = s-starts;
3900 endinpos = startinpos+1;
3901 if (unicode_decode_call_errorhandler(
3902 errors, &errorHandler,
3903 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003904 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003905 (PyObject **)&v, &outpos, &p)) {
3906 Py_DECREF(x);
3907 goto onError;
3908 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003909 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003910 continue;
3911 }
3912 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003913 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003914
3915 if (targetsize == 1)
3916 /* 1-1 mapping */
3917 *p++ = *PyUnicode_AS_UNICODE(x);
3918
3919 else if (targetsize > 1) {
3920 /* 1-n mapping */
3921 if (targetsize > extrachars) {
3922 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003923 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3924 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003925 (targetsize << 2);
3926 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003927 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003928 if (_PyUnicode_Resize(&v,
3929 PyUnicode_GET_SIZE(v) + needed) < 0) {
3930 Py_DECREF(x);
3931 goto onError;
3932 }
3933 p = PyUnicode_AS_UNICODE(v) + oldpos;
3934 }
3935 Py_UNICODE_COPY(p,
3936 PyUnicode_AS_UNICODE(x),
3937 targetsize);
3938 p += targetsize;
3939 extrachars -= targetsize;
3940 }
3941 /* 1-0 mapping: skip the character */
3942 }
3943 else {
3944 /* wrong return value */
3945 PyErr_SetString(PyExc_TypeError,
3946 "character mapping must return integer, None or unicode");
3947 Py_DECREF(x);
3948 goto onError;
3949 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003951 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 }
3954 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003955 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 Py_XDECREF(errorHandler);
3958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003960
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 Py_XDECREF(errorHandler);
3963 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 Py_XDECREF(v);
3965 return NULL;
3966}
3967
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003968/* Charmap encoding: the lookup table */
3969
3970struct encoding_map{
3971 PyObject_HEAD
3972 unsigned char level1[32];
3973 int count2, count3;
3974 unsigned char level23[1];
3975};
3976
3977static PyObject*
3978encoding_map_size(PyObject *obj, PyObject* args)
3979{
3980 struct encoding_map *map = (struct encoding_map*)obj;
3981 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3982 128*map->count3);
3983}
3984
3985static PyMethodDef encoding_map_methods[] = {
3986 {"size", encoding_map_size, METH_NOARGS,
3987 PyDoc_STR("Return the size (in bytes) of this object") },
3988 { 0 }
3989};
3990
3991static void
3992encoding_map_dealloc(PyObject* o)
3993{
3994 PyObject_FREE(o);
3995}
3996
3997static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003998 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003999 "EncodingMap", /*tp_name*/
4000 sizeof(struct encoding_map), /*tp_basicsize*/
4001 0, /*tp_itemsize*/
4002 /* methods */
4003 encoding_map_dealloc, /*tp_dealloc*/
4004 0, /*tp_print*/
4005 0, /*tp_getattr*/
4006 0, /*tp_setattr*/
4007 0, /*tp_compare*/
4008 0, /*tp_repr*/
4009 0, /*tp_as_number*/
4010 0, /*tp_as_sequence*/
4011 0, /*tp_as_mapping*/
4012 0, /*tp_hash*/
4013 0, /*tp_call*/
4014 0, /*tp_str*/
4015 0, /*tp_getattro*/
4016 0, /*tp_setattro*/
4017 0, /*tp_as_buffer*/
4018 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4019 0, /*tp_doc*/
4020 0, /*tp_traverse*/
4021 0, /*tp_clear*/
4022 0, /*tp_richcompare*/
4023 0, /*tp_weaklistoffset*/
4024 0, /*tp_iter*/
4025 0, /*tp_iternext*/
4026 encoding_map_methods, /*tp_methods*/
4027 0, /*tp_members*/
4028 0, /*tp_getset*/
4029 0, /*tp_base*/
4030 0, /*tp_dict*/
4031 0, /*tp_descr_get*/
4032 0, /*tp_descr_set*/
4033 0, /*tp_dictoffset*/
4034 0, /*tp_init*/
4035 0, /*tp_alloc*/
4036 0, /*tp_new*/
4037 0, /*tp_free*/
4038 0, /*tp_is_gc*/
4039};
4040
4041PyObject*
4042PyUnicode_BuildEncodingMap(PyObject* string)
4043{
4044 Py_UNICODE *decode;
4045 PyObject *result;
4046 struct encoding_map *mresult;
4047 int i;
4048 int need_dict = 0;
4049 unsigned char level1[32];
4050 unsigned char level2[512];
4051 unsigned char *mlevel1, *mlevel2, *mlevel3;
4052 int count2 = 0, count3 = 0;
4053
4054 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4055 PyErr_BadArgument();
4056 return NULL;
4057 }
4058 decode = PyUnicode_AS_UNICODE(string);
4059 memset(level1, 0xFF, sizeof level1);
4060 memset(level2, 0xFF, sizeof level2);
4061
4062 /* If there isn't a one-to-one mapping of NULL to \0,
4063 or if there are non-BMP characters, we need to use
4064 a mapping dictionary. */
4065 if (decode[0] != 0)
4066 need_dict = 1;
4067 for (i = 1; i < 256; i++) {
4068 int l1, l2;
4069 if (decode[i] == 0
4070 #ifdef Py_UNICODE_WIDE
4071 || decode[i] > 0xFFFF
4072 #endif
4073 ) {
4074 need_dict = 1;
4075 break;
4076 }
4077 if (decode[i] == 0xFFFE)
4078 /* unmapped character */
4079 continue;
4080 l1 = decode[i] >> 11;
4081 l2 = decode[i] >> 7;
4082 if (level1[l1] == 0xFF)
4083 level1[l1] = count2++;
4084 if (level2[l2] == 0xFF)
4085 level2[l2] = count3++;
4086 }
4087
4088 if (count2 >= 0xFF || count3 >= 0xFF)
4089 need_dict = 1;
4090
4091 if (need_dict) {
4092 PyObject *result = PyDict_New();
4093 PyObject *key, *value;
4094 if (!result)
4095 return NULL;
4096 for (i = 0; i < 256; i++) {
4097 key = value = NULL;
4098 key = PyInt_FromLong(decode[i]);
4099 value = PyInt_FromLong(i);
4100 if (!key || !value)
4101 goto failed1;
4102 if (PyDict_SetItem(result, key, value) == -1)
4103 goto failed1;
4104 Py_DECREF(key);
4105 Py_DECREF(value);
4106 }
4107 return result;
4108 failed1:
4109 Py_XDECREF(key);
4110 Py_XDECREF(value);
4111 Py_DECREF(result);
4112 return NULL;
4113 }
4114
4115 /* Create a three-level trie */
4116 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4117 16*count2 + 128*count3 - 1);
4118 if (!result)
4119 return PyErr_NoMemory();
4120 PyObject_Init(result, &EncodingMapType);
4121 mresult = (struct encoding_map*)result;
4122 mresult->count2 = count2;
4123 mresult->count3 = count3;
4124 mlevel1 = mresult->level1;
4125 mlevel2 = mresult->level23;
4126 mlevel3 = mresult->level23 + 16*count2;
4127 memcpy(mlevel1, level1, 32);
4128 memset(mlevel2, 0xFF, 16*count2);
4129 memset(mlevel3, 0, 128*count3);
4130 count3 = 0;
4131 for (i = 1; i < 256; i++) {
4132 int o1, o2, o3, i2, i3;
4133 if (decode[i] == 0xFFFE)
4134 /* unmapped character */
4135 continue;
4136 o1 = decode[i]>>11;
4137 o2 = (decode[i]>>7) & 0xF;
4138 i2 = 16*mlevel1[o1] + o2;
4139 if (mlevel2[i2] == 0xFF)
4140 mlevel2[i2] = count3++;
4141 o3 = decode[i] & 0x7F;
4142 i3 = 128*mlevel2[i2] + o3;
4143 mlevel3[i3] = i;
4144 }
4145 return result;
4146}
4147
4148static int
4149encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4150{
4151 struct encoding_map *map = (struct encoding_map*)mapping;
4152 int l1 = c>>11;
4153 int l2 = (c>>7) & 0xF;
4154 int l3 = c & 0x7F;
4155 int i;
4156
4157#ifdef Py_UNICODE_WIDE
4158 if (c > 0xFFFF) {
4159 return -1;
4160 }
4161#endif
4162 if (c == 0)
4163 return 0;
4164 /* level 1*/
4165 i = map->level1[l1];
4166 if (i == 0xFF) {
4167 return -1;
4168 }
4169 /* level 2*/
4170 i = map->level23[16*i+l2];
4171 if (i == 0xFF) {
4172 return -1;
4173 }
4174 /* level 3 */
4175 i = map->level23[16*map->count2 + 128*i + l3];
4176 if (i == 0) {
4177 return -1;
4178 }
4179 return i;
4180}
4181
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182/* Lookup the character ch in the mapping. If the character
4183 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004184 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 PyObject *w = PyInt_FromLong((long)c);
4188 PyObject *x;
4189
4190 if (w == NULL)
4191 return NULL;
4192 x = PyObject_GetItem(mapping, w);
4193 Py_DECREF(w);
4194 if (x == NULL) {
4195 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4196 /* No mapping found means: mapping is undefined. */
4197 PyErr_Clear();
4198 x = Py_None;
4199 Py_INCREF(x);
4200 return x;
4201 } else
4202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004204 else if (x == Py_None)
4205 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 else if (PyInt_Check(x)) {
4207 long value = PyInt_AS_LONG(x);
4208 if (value < 0 || value > 255) {
4209 PyErr_SetString(PyExc_TypeError,
4210 "character mapping must be in range(256)");
4211 Py_DECREF(x);
4212 return NULL;
4213 }
4214 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 else if (PyString_Check(x))
4217 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004220 PyErr_Format(PyExc_TypeError,
4221 "character mapping must return integer, None or str8, not %.400s",
4222 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 Py_DECREF(x);
4224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 }
4226}
4227
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004228static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004229charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004230{
Walter Dörwald827b0552007-05-12 13:23:53 +00004231 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004232 /* exponentially overallocate to minimize reallocations */
4233 if (requiredsize < 2*outsize)
4234 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004235 if (PyBytes_Resize(outobj, requiredsize)) {
4236 Py_DECREF(outobj);
4237 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004238 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004239 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004240}
4241
/* Outcome of trying to encode one character with charmapencode_output */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* a Python exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004246 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 space is available. Return a new reference to the object that
4248 was put in the output buffer, or Py_None, if the mapping was undefined
4249 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004250 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004252charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004253 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004255 PyObject *rep;
4256 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004257 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004259 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004260 int res = encoding_map_lookup(c, mapping);
4261 Py_ssize_t requiredsize = *outpos+1;
4262 if (res == -1)
4263 return enc_FAILED;
4264 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004265 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004266 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004267 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004268 outstart[(*outpos)++] = (char)res;
4269 return enc_SUCCESS;
4270 }
4271
4272 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004273 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004274 return enc_EXCEPTION;
4275 else if (rep==Py_None) {
4276 Py_DECREF(rep);
4277 return enc_FAILED;
4278 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004280 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004281 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004282 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004284 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004286 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4288 }
4289 else {
4290 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004291 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4292 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004293 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004294 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004296 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004298 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299 memcpy(outstart + *outpos, repchars, repsize);
4300 *outpos += repsize;
4301 }
4302 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004303 Py_DECREF(rep);
4304 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305}
4306
4307/* handle an error in PyUnicode_EncodeCharmap
4308 Return 0 on success, -1 on error */
4309static
4310int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004311 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004313 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004314 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315{
4316 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004317 Py_ssize_t repsize;
4318 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_UNICODE *uni2;
4320 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 Py_ssize_t collstartpos = *inpos;
4322 Py_ssize_t collendpos = *inpos+1;
4323 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004324 char *encoding = "charmap";
4325 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004326 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 /* find all unencodable characters */
4329 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004330 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004331 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004332 int res = encoding_map_lookup(p[collendpos], mapping);
4333 if (res != -1)
4334 break;
4335 ++collendpos;
4336 continue;
4337 }
4338
4339 rep = charmapencode_lookup(p[collendpos], mapping);
4340 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004342 else if (rep!=Py_None) {
4343 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 break;
4345 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004346 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 ++collendpos;
4348 }
4349 /* cache callback name lookup
4350 * (if not done yet, i.e. it's the first error) */
4351 if (*known_errorHandler==-1) {
4352 if ((errors==NULL) || (!strcmp(errors, "strict")))
4353 *known_errorHandler = 1;
4354 else if (!strcmp(errors, "replace"))
4355 *known_errorHandler = 2;
4356 else if (!strcmp(errors, "ignore"))
4357 *known_errorHandler = 3;
4358 else if (!strcmp(errors, "xmlcharrefreplace"))
4359 *known_errorHandler = 4;
4360 else
4361 *known_errorHandler = 0;
4362 }
4363 switch (*known_errorHandler) {
4364 case 1: /* strict */
4365 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4366 return -1;
4367 case 2: /* replace */
4368 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4369 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004370 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 return -1;
4372 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004373 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4375 return -1;
4376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377 }
4378 /* fall through */
4379 case 3: /* ignore */
4380 *inpos = collendpos;
4381 break;
4382 case 4: /* xmlcharrefreplace */
4383 /* generate replacement (temporarily (mis)uses p) */
4384 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4385 char buffer[2+29+1+1];
4386 char *cp;
4387 sprintf(buffer, "&#%d;", (int)p[collpos]);
4388 for (cp = buffer; *cp; ++cp) {
4389 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004390 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004392 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4394 return -1;
4395 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 }
4397 }
4398 *inpos = collendpos;
4399 break;
4400 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004401 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 encoding, reason, p, size, exceptionObject,
4403 collstartpos, collendpos, &newpos);
4404 if (repunicode == NULL)
4405 return -1;
4406 /* generate replacement */
4407 repsize = PyUnicode_GET_SIZE(repunicode);
4408 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4409 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004410 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 return -1;
4412 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004413 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4416 return -1;
4417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 }
4419 *inpos = newpos;
4420 Py_DECREF(repunicode);
4421 }
4422 return 0;
4423}
4424
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 PyObject *mapping,
4428 const char *errors)
4429{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 /* output object */
4431 PyObject *res = NULL;
4432 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004435 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 PyObject *errorHandler = NULL;
4437 PyObject *exc = NULL;
4438 /* the following variable is used for caching string comparisons
4439 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4440 * 3=ignore, 4=xmlcharrefreplace */
4441 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442
4443 /* Default to Latin-1 */
4444 if (mapping == NULL)
4445 return PyUnicode_EncodeLatin1(p, size, errors);
4446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 /* allocate enough for a simple encoding without
4448 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004449 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 if (res == NULL)
4451 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004452 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 while (inpos<size) {
4456 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004457 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004458 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004460 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 if (charmap_encoding_error(p, size, &inpos, mapping,
4462 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004463 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004464 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004465 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 else
4469 /* done with this character => adjust input position */
4470 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004474 if (respos<PyBytes_GET_SIZE(res)) {
4475 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 goto onError;
4477 }
4478 Py_XDECREF(exc);
4479 Py_XDECREF(errorHandler);
4480 return res;
4481
4482 onError:
4483 Py_XDECREF(res);
4484 Py_XDECREF(exc);
4485 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 return NULL;
4487}
4488
4489PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4490 PyObject *mapping)
4491{
4492 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4493 PyErr_BadArgument();
4494 return NULL;
4495 }
4496 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4497 PyUnicode_GET_SIZE(unicode),
4498 mapping,
4499 NULL);
4500}
4501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502/* create or adjust a UnicodeTranslateError */
4503static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004504 const Py_UNICODE *unicode, Py_ssize_t size,
4505 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 if (*exceptionObject == NULL) {
4509 *exceptionObject = PyUnicodeTranslateError_Create(
4510 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511 }
4512 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4514 goto onError;
4515 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4516 goto onError;
4517 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4518 goto onError;
4519 return;
4520 onError:
4521 Py_DECREF(*exceptionObject);
4522 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 }
4524}
4525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526/* raises a UnicodeTranslateError */
4527static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004528 const Py_UNICODE *unicode, Py_ssize_t size,
4529 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 const char *reason)
4531{
4532 make_translate_exception(exceptionObject,
4533 unicode, size, startpos, endpos, reason);
4534 if (*exceptionObject != NULL)
4535 PyCodec_StrictErrors(*exceptionObject);
4536}
4537
4538/* error handling callback helper:
4539 build arguments, call the callback and check the arguments,
4540 put the result into newpos and return the replacement string, which
4541 has to be freed by the caller */
4542static PyObject *unicode_translate_call_errorhandler(const char *errors,
4543 PyObject **errorHandler,
4544 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004545 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4546 Py_ssize_t startpos, Py_ssize_t endpos,
4547 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004549 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004551 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 PyObject *restuple;
4553 PyObject *resunicode;
4554
4555 if (*errorHandler == NULL) {
4556 *errorHandler = PyCodec_LookupError(errors);
4557 if (*errorHandler == NULL)
4558 return NULL;
4559 }
4560
4561 make_translate_exception(exceptionObject,
4562 unicode, size, startpos, endpos, reason);
4563 if (*exceptionObject == NULL)
4564 return NULL;
4565
4566 restuple = PyObject_CallFunctionObjArgs(
4567 *errorHandler, *exceptionObject, NULL);
4568 if (restuple == NULL)
4569 return NULL;
4570 if (!PyTuple_Check(restuple)) {
4571 PyErr_Format(PyExc_TypeError, &argparse[4]);
4572 Py_DECREF(restuple);
4573 return NULL;
4574 }
4575 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004576 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 Py_DECREF(restuple);
4578 return NULL;
4579 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004580 if (i_newpos<0)
4581 *newpos = size+i_newpos;
4582 else
4583 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004584 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004585 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004586 Py_DECREF(restuple);
4587 return NULL;
4588 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 Py_INCREF(resunicode);
4590 Py_DECREF(restuple);
4591 return resunicode;
4592}
4593
4594/* Lookup the character ch in the mapping and put the result in result,
4595 which must be decrefed by the caller.
4596 Return 0 on success, -1 on error */
4597static
4598int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4599{
4600 PyObject *w = PyInt_FromLong((long)c);
4601 PyObject *x;
4602
4603 if (w == NULL)
4604 return -1;
4605 x = PyObject_GetItem(mapping, w);
4606 Py_DECREF(w);
4607 if (x == NULL) {
4608 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4609 /* No mapping found means: use 1:1 mapping. */
4610 PyErr_Clear();
4611 *result = NULL;
4612 return 0;
4613 } else
4614 return -1;
4615 }
4616 else if (x == Py_None) {
4617 *result = x;
4618 return 0;
4619 }
4620 else if (PyInt_Check(x)) {
4621 long value = PyInt_AS_LONG(x);
4622 long max = PyUnicode_GetMax();
4623 if (value < 0 || value > max) {
4624 PyErr_Format(PyExc_TypeError,
4625 "character mapping must be in range(0x%lx)", max+1);
4626 Py_DECREF(x);
4627 return -1;
4628 }
4629 *result = x;
4630 return 0;
4631 }
4632 else if (PyUnicode_Check(x)) {
4633 *result = x;
4634 return 0;
4635 }
4636 else {
4637 /* wrong return value */
4638 PyErr_SetString(PyExc_TypeError,
4639 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004640 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 return -1;
4642 }
4643}
4644/* ensure that *outobj is at least requiredsize characters long,
4645if not reallocate and adjust various state variables.
4646Return 0 on success, -1 on error */
4647static
Walter Dörwald4894c302003-10-24 14:25:28 +00004648int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004649 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004652 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004654 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004656 if (requiredsize < 2 * oldsize)
4657 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004658 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 return -1;
4660 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 }
4662 return 0;
4663}
4664/* lookup the character, put the result in the output string and adjust
4665 various state variables. Return a new reference to the object that
4666 was put in the output buffer in *result, or Py_None, if the mapping was
4667 undefined (in which case no character was written).
4668 The called must decref result.
4669 Return 0 on success, -1 on error. */
4670static
Walter Dörwald4894c302003-10-24 14:25:28 +00004671int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004672 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004673 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674{
Walter Dörwald4894c302003-10-24 14:25:28 +00004675 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 return -1;
4677 if (*res==NULL) {
4678 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004679 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 }
4681 else if (*res==Py_None)
4682 ;
4683 else if (PyInt_Check(*res)) {
4684 /* no overflow check, because we know that the space is enough */
4685 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4686 }
4687 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004688 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 if (repsize==1) {
4690 /* no overflow check, because we know that the space is enough */
4691 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4692 }
4693 else if (repsize!=0) {
4694 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004695 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004696 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004697 repsize - 1;
4698 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 return -1;
4700 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4701 *outp += repsize;
4702 }
4703 }
4704 else
4705 return -1;
4706 return 0;
4707}
4708
4709PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004710 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 PyObject *mapping,
4712 const char *errors)
4713{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 /* output object */
4715 PyObject *res = NULL;
4716 /* pointers to the beginning and end+1 of input */
4717 const Py_UNICODE *startp = p;
4718 const Py_UNICODE *endp = p + size;
4719 /* pointer into the output */
4720 Py_UNICODE *str;
4721 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004722 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 char *reason = "character maps to <undefined>";
4724 PyObject *errorHandler = NULL;
4725 PyObject *exc = NULL;
4726 /* the following variable is used for caching string comparisons
4727 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4728 * 3=ignore, 4=xmlcharrefreplace */
4729 int known_errorHandler = -1;
4730
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 if (mapping == NULL) {
4732 PyErr_BadArgument();
4733 return NULL;
4734 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735
4736 /* allocate enough for a simple 1:1 translation without
4737 replacements, if we need more, we'll resize */
4738 res = PyUnicode_FromUnicode(NULL, size);
4739 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004740 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 return res;
4743 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 while (p<endp) {
4746 /* try to encode it */
4747 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004748 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 goto onError;
4751 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004752 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 if (x!=Py_None) /* it worked => adjust input pointer */
4754 ++p;
4755 else { /* untranslatable character */
4756 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004757 Py_ssize_t repsize;
4758 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004759 Py_UNICODE *uni2;
4760 /* startpos for collecting untranslatable chars */
4761 const Py_UNICODE *collstart = p;
4762 const Py_UNICODE *collend = p+1;
4763 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765 /* find all untranslatable characters */
4766 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004767 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 goto onError;
4769 Py_XDECREF(x);
4770 if (x!=Py_None)
4771 break;
4772 ++collend;
4773 }
4774 /* cache callback name lookup
4775 * (if not done yet, i.e. it's the first error) */
4776 if (known_errorHandler==-1) {
4777 if ((errors==NULL) || (!strcmp(errors, "strict")))
4778 known_errorHandler = 1;
4779 else if (!strcmp(errors, "replace"))
4780 known_errorHandler = 2;
4781 else if (!strcmp(errors, "ignore"))
4782 known_errorHandler = 3;
4783 else if (!strcmp(errors, "xmlcharrefreplace"))
4784 known_errorHandler = 4;
4785 else
4786 known_errorHandler = 0;
4787 }
4788 switch (known_errorHandler) {
4789 case 1: /* strict */
4790 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4791 goto onError;
4792 case 2: /* replace */
4793 /* No need to check for space, this is a 1:1 replacement */
4794 for (coll = collstart; coll<collend; ++coll)
4795 *str++ = '?';
4796 /* fall through */
4797 case 3: /* ignore */
4798 p = collend;
4799 break;
4800 case 4: /* xmlcharrefreplace */
4801 /* generate replacement (temporarily (mis)uses p) */
4802 for (p = collstart; p < collend; ++p) {
4803 char buffer[2+29+1+1];
4804 char *cp;
4805 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004806 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4808 goto onError;
4809 for (cp = buffer; *cp; ++cp)
4810 *str++ = *cp;
4811 }
4812 p = collend;
4813 break;
4814 default:
4815 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4816 reason, startp, size, &exc,
4817 collstart-startp, collend-startp, &newpos);
4818 if (repunicode == NULL)
4819 goto onError;
4820 /* generate replacement */
4821 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004822 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4824 Py_DECREF(repunicode);
4825 goto onError;
4826 }
4827 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4828 *str++ = *uni2;
4829 p = startp + newpos;
4830 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
4832 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 /* Resize if we allocated to much */
4835 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004836 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004837 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004838 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 }
4840 Py_XDECREF(exc);
4841 Py_XDECREF(errorHandler);
4842 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 onError:
4845 Py_XDECREF(res);
4846 Py_XDECREF(exc);
4847 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 return NULL;
4849}
4850
4851PyObject *PyUnicode_Translate(PyObject *str,
4852 PyObject *mapping,
4853 const char *errors)
4854{
4855 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004856
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 str = PyUnicode_FromObject(str);
4858 if (str == NULL)
4859 goto onError;
4860 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4861 PyUnicode_GET_SIZE(str),
4862 mapping,
4863 errors);
4864 Py_DECREF(str);
4865 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004866
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 onError:
4868 Py_XDECREF(str);
4869 return NULL;
4870}
Tim Petersced69f82003-09-16 20:30:58 +00004871
Guido van Rossum9e896b32000-04-05 20:11:21 +00004872/* --- Decimal Encoder ---------------------------------------------------- */
4873
4874int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004875 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004876 char *output,
4877 const char *errors)
4878{
4879 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 PyObject *errorHandler = NULL;
4881 PyObject *exc = NULL;
4882 const char *encoding = "decimal";
4883 const char *reason = "invalid decimal Unicode string";
4884 /* the following variable is used for caching string comparisons
4885 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4886 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004887
4888 if (output == NULL) {
4889 PyErr_BadArgument();
4890 return -1;
4891 }
4892
4893 p = s;
4894 end = s + length;
4895 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004897 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004899 Py_ssize_t repsize;
4900 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 Py_UNICODE *uni2;
4902 Py_UNICODE *collstart;
4903 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004904
Guido van Rossum9e896b32000-04-05 20:11:21 +00004905 if (Py_UNICODE_ISSPACE(ch)) {
4906 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004908 continue;
4909 }
4910 decimal = Py_UNICODE_TODECIMAL(ch);
4911 if (decimal >= 0) {
4912 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004913 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004914 continue;
4915 }
Guido van Rossumba477042000-04-06 18:18:10 +00004916 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004917 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004919 continue;
4920 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921 /* All other characters are considered unencodable */
4922 collstart = p;
4923 collend = p+1;
4924 while (collend < end) {
4925 if ((0 < *collend && *collend < 256) ||
4926 !Py_UNICODE_ISSPACE(*collend) ||
4927 Py_UNICODE_TODECIMAL(*collend))
4928 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004929 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 /* cache callback name lookup
4931 * (if not done yet, i.e. it's the first error) */
4932 if (known_errorHandler==-1) {
4933 if ((errors==NULL) || (!strcmp(errors, "strict")))
4934 known_errorHandler = 1;
4935 else if (!strcmp(errors, "replace"))
4936 known_errorHandler = 2;
4937 else if (!strcmp(errors, "ignore"))
4938 known_errorHandler = 3;
4939 else if (!strcmp(errors, "xmlcharrefreplace"))
4940 known_errorHandler = 4;
4941 else
4942 known_errorHandler = 0;
4943 }
4944 switch (known_errorHandler) {
4945 case 1: /* strict */
4946 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4947 goto onError;
4948 case 2: /* replace */
4949 for (p = collstart; p < collend; ++p)
4950 *output++ = '?';
4951 /* fall through */
4952 case 3: /* ignore */
4953 p = collend;
4954 break;
4955 case 4: /* xmlcharrefreplace */
4956 /* generate replacement (temporarily (mis)uses p) */
4957 for (p = collstart; p < collend; ++p)
4958 output += sprintf(output, "&#%d;", (int)*p);
4959 p = collend;
4960 break;
4961 default:
4962 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4963 encoding, reason, s, length, &exc,
4964 collstart-s, collend-s, &newpos);
4965 if (repunicode == NULL)
4966 goto onError;
4967 /* generate replacement */
4968 repsize = PyUnicode_GET_SIZE(repunicode);
4969 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4970 Py_UNICODE ch = *uni2;
4971 if (Py_UNICODE_ISSPACE(ch))
4972 *output++ = ' ';
4973 else {
4974 decimal = Py_UNICODE_TODECIMAL(ch);
4975 if (decimal >= 0)
4976 *output++ = '0' + decimal;
4977 else if (0 < ch && ch < 256)
4978 *output++ = (char)ch;
4979 else {
4980 Py_DECREF(repunicode);
4981 raise_encode_exception(&exc, encoding,
4982 s, length, collstart-s, collend-s, reason);
4983 goto onError;
4984 }
4985 }
4986 }
4987 p = s + newpos;
4988 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004989 }
4990 }
4991 /* 0-terminate the output string */
4992 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004993 Py_XDECREF(exc);
4994 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004995 return 0;
4996
4997 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998 Py_XDECREF(exc);
4999 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005000 return -1;
5001}
5002
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003/* --- Helpers ------------------------------------------------------------ */
5004
Eric Smith8c663262007-08-25 02:26:07 +00005005#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005006
5007#include "stringlib/fastsearch.h"
5008
5009#include "stringlib/count.h"
5010#include "stringlib/find.h"
5011#include "stringlib/partition.h"
5012
/* Helper macro to fix up slice bounds.  It operates on the variables
   named `start` and `end` in the invoking scope, using (obj)->length as
   the sequence length: negative values count from the end, anything
   still negative is clamped to 0, and `end` is clamped to the length.
   `start` is not clamped against the length.

   Wrapped in do { } while (0) so the expansion is a single statement
   and behaves correctly under an unbraced if/else; invoke it with a
   trailing semicolon.  (obj) is evaluated more than once. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5025
Martin v. Löwis18e16552006-02-15 17:27:45 +00005026Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005027 PyObject *substr,
5028 Py_ssize_t start,
5029 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005031 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005032 PyUnicodeObject* str_obj;
5033 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005034
Thomas Wouters477c8d52006-05-27 19:21:47 +00005035 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5036 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005038 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5039 if (!sub_obj) {
5040 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 return -1;
5042 }
Tim Petersced69f82003-09-16 20:30:58 +00005043
Thomas Wouters477c8d52006-05-27 19:21:47 +00005044 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005045
Thomas Wouters477c8d52006-05-27 19:21:47 +00005046 result = stringlib_count(
5047 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5048 );
5049
5050 Py_DECREF(sub_obj);
5051 Py_DECREF(str_obj);
5052
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 return result;
5054}
5055
Martin v. Löwis18e16552006-02-15 17:27:45 +00005056Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005057 PyObject *sub,
5058 Py_ssize_t start,
5059 Py_ssize_t end,
5060 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005062 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005063
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005065 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005066 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005067 sub = PyUnicode_FromObject(sub);
5068 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005069 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005070 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 }
Tim Petersced69f82003-09-16 20:30:58 +00005072
Thomas Wouters477c8d52006-05-27 19:21:47 +00005073 if (direction > 0)
5074 result = stringlib_find_slice(
5075 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5076 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5077 start, end
5078 );
5079 else
5080 result = stringlib_rfind_slice(
5081 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5082 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5083 start, end
5084 );
5085
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005087 Py_DECREF(sub);
5088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 return result;
5090}
5091
Tim Petersced69f82003-09-16 20:30:58 +00005092static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093int tailmatch(PyUnicodeObject *self,
5094 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005095 Py_ssize_t start,
5096 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 int direction)
5098{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 if (substring->length == 0)
5100 return 1;
5101
Thomas Wouters477c8d52006-05-27 19:21:47 +00005102 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103
5104 end -= substring->length;
5105 if (end < start)
5106 return 0;
5107
5108 if (direction > 0) {
5109 if (Py_UNICODE_MATCH(self, end, substring))
5110 return 1;
5111 } else {
5112 if (Py_UNICODE_MATCH(self, start, substring))
5113 return 1;
5114 }
5115
5116 return 0;
5117}
5118
Martin v. Löwis18e16552006-02-15 17:27:45 +00005119Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005121 Py_ssize_t start,
5122 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 int direction)
5124{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005125 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 str = PyUnicode_FromObject(str);
5128 if (str == NULL)
5129 return -1;
5130 substr = PyUnicode_FromObject(substr);
5131 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005132 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 return -1;
5134 }
Tim Petersced69f82003-09-16 20:30:58 +00005135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 result = tailmatch((PyUnicodeObject *)str,
5137 (PyUnicodeObject *)substr,
5138 start, end, direction);
5139 Py_DECREF(str);
5140 Py_DECREF(substr);
5141 return result;
5142}
5143
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144/* Apply fixfct filter to the Unicode object self and return a
5145 reference to the modified object */
5146
Tim Petersced69f82003-09-16 20:30:58 +00005147static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148PyObject *fixup(PyUnicodeObject *self,
5149 int (*fixfct)(PyUnicodeObject *s))
5150{
5151
5152 PyUnicodeObject *u;
5153
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005154 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 if (u == NULL)
5156 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005157
5158 Py_UNICODE_COPY(u->str, self->str, self->length);
5159
Tim Peters7a29bd52001-09-12 03:03:31 +00005160 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 /* fixfct should return TRUE if it modified the buffer. If
5162 FALSE, return a reference to the original buffer instead
5163 (to save space, not time) */
5164 Py_INCREF(self);
5165 Py_DECREF(u);
5166 return (PyObject*) self;
5167 }
5168 return (PyObject*) u;
5169}
5170
Tim Petersced69f82003-09-16 20:30:58 +00005171static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172int fixupper(PyUnicodeObject *self)
5173{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005174 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 Py_UNICODE *s = self->str;
5176 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005177
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 while (len-- > 0) {
5179 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005180
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 ch = Py_UNICODE_TOUPPER(*s);
5182 if (ch != *s) {
5183 status = 1;
5184 *s = ch;
5185 }
5186 s++;
5187 }
5188
5189 return status;
5190}
5191
Tim Petersced69f82003-09-16 20:30:58 +00005192static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193int fixlower(PyUnicodeObject *self)
5194{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005195 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 Py_UNICODE *s = self->str;
5197 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 while (len-- > 0) {
5200 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005201
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 ch = Py_UNICODE_TOLOWER(*s);
5203 if (ch != *s) {
5204 status = 1;
5205 *s = ch;
5206 }
5207 s++;
5208 }
5209
5210 return status;
5211}
5212
Tim Petersced69f82003-09-16 20:30:58 +00005213static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214int fixswapcase(PyUnicodeObject *self)
5215{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005216 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 Py_UNICODE *s = self->str;
5218 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 while (len-- > 0) {
5221 if (Py_UNICODE_ISUPPER(*s)) {
5222 *s = Py_UNICODE_TOLOWER(*s);
5223 status = 1;
5224 } else if (Py_UNICODE_ISLOWER(*s)) {
5225 *s = Py_UNICODE_TOUPPER(*s);
5226 status = 1;
5227 }
5228 s++;
5229 }
5230
5231 return status;
5232}
5233
Tim Petersced69f82003-09-16 20:30:58 +00005234static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235int fixcapitalize(PyUnicodeObject *self)
5236{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005238 Py_UNICODE *s = self->str;
5239 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005240
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005241 if (len == 0)
5242 return 0;
5243 if (Py_UNICODE_ISLOWER(*s)) {
5244 *s = Py_UNICODE_TOUPPER(*s);
5245 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005247 s++;
5248 while (--len > 0) {
5249 if (Py_UNICODE_ISUPPER(*s)) {
5250 *s = Py_UNICODE_TOLOWER(*s);
5251 status = 1;
5252 }
5253 s++;
5254 }
5255 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256}
5257
5258static
5259int fixtitle(PyUnicodeObject *self)
5260{
5261 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5262 register Py_UNICODE *e;
5263 int previous_is_cased;
5264
5265 /* Shortcut for single character strings */
5266 if (PyUnicode_GET_SIZE(self) == 1) {
5267 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5268 if (*p != ch) {
5269 *p = ch;
5270 return 1;
5271 }
5272 else
5273 return 0;
5274 }
Tim Petersced69f82003-09-16 20:30:58 +00005275
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 e = p + PyUnicode_GET_SIZE(self);
5277 previous_is_cased = 0;
5278 for (; p < e; p++) {
5279 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 if (previous_is_cased)
5282 *p = Py_UNICODE_TOLOWER(ch);
5283 else
5284 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005285
5286 if (Py_UNICODE_ISLOWER(ch) ||
5287 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 Py_UNICODE_ISTITLE(ch))
5289 previous_is_cased = 1;
5290 else
5291 previous_is_cased = 0;
5292 }
5293 return 1;
5294}
5295
Tim Peters8ce9f162004-08-27 01:49:32 +00005296PyObject *
5297PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298{
Tim Peters8ce9f162004-08-27 01:49:32 +00005299 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005300 const Py_UNICODE blank = ' ';
5301 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005302 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005303 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005304 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5305 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005306 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5307 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005309 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005310 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
Tim Peters05eba1f2004-08-27 21:32:02 +00005312 fseq = PySequence_Fast(seq, "");
5313 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005314 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005315 }
5316
Tim Peters91879ab2004-08-27 22:35:44 +00005317 /* Grrrr. A codec may be invoked to convert str objects to
5318 * Unicode, and so it's possible to call back into Python code
5319 * during PyUnicode_FromObject(), and so it's possible for a sick
5320 * codec to change the size of fseq (if seq is a list). Therefore
5321 * we have to keep refetching the size -- can't assume seqlen
5322 * is invariant.
5323 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005324 seqlen = PySequence_Fast_GET_SIZE(fseq);
5325 /* If empty sequence, return u"". */
5326 if (seqlen == 0) {
5327 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5328 goto Done;
5329 }
5330 /* If singleton sequence with an exact Unicode, return that. */
5331 if (seqlen == 1) {
5332 item = PySequence_Fast_GET_ITEM(fseq, 0);
5333 if (PyUnicode_CheckExact(item)) {
5334 Py_INCREF(item);
5335 res = (PyUnicodeObject *)item;
5336 goto Done;
5337 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005338 }
5339
Tim Peters05eba1f2004-08-27 21:32:02 +00005340 /* At least two items to join, or one that isn't exact Unicode. */
5341 if (seqlen > 1) {
5342 /* Set up sep and seplen -- they're needed. */
5343 if (separator == NULL) {
5344 sep = &blank;
5345 seplen = 1;
5346 }
5347 else {
5348 internal_separator = PyUnicode_FromObject(separator);
5349 if (internal_separator == NULL)
5350 goto onError;
5351 sep = PyUnicode_AS_UNICODE(internal_separator);
5352 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005353 /* In case PyUnicode_FromObject() mutated seq. */
5354 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005355 }
5356 }
5357
5358 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005359 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005360 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005361 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005362 res_p = PyUnicode_AS_UNICODE(res);
5363 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005364
Tim Peters05eba1f2004-08-27 21:32:02 +00005365 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005366 Py_ssize_t itemlen;
5367 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005368
5369 item = PySequence_Fast_GET_ITEM(fseq, i);
5370 /* Convert item to Unicode. */
5371 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5372 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005373 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005374 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005375 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005376 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005377 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005378 item = PyUnicode_FromObject(item);
5379 if (item == NULL)
5380 goto onError;
5381 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005382
Tim Peters91879ab2004-08-27 22:35:44 +00005383 /* In case PyUnicode_FromObject() mutated seq. */
5384 seqlen = PySequence_Fast_GET_SIZE(fseq);
5385
Tim Peters8ce9f162004-08-27 01:49:32 +00005386 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005388 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005389 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005390 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005391 if (i < seqlen - 1) {
5392 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005393 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005394 goto Overflow;
5395 }
5396 if (new_res_used > res_alloc) {
5397 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005398 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005399 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005400 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005401 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005402 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005403 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005404 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005406 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005407 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005409
5410 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005411 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005412 res_p += itemlen;
5413 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005414 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005415 res_p += seplen;
5416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005418 res_used = new_res_used;
5419 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005420
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 /* Shrink res to match the used area; this probably can't fail,
5422 * but it's cheap to check.
5423 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005424 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005425 goto onError;
5426
5427 Done:
5428 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005429 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 return (PyObject *)res;
5431
Tim Peters8ce9f162004-08-27 01:49:32 +00005432 Overflow:
5433 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005434 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005435 Py_DECREF(item);
5436 /* fall through */
5437
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005439 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005440 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005441 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 return NULL;
5443}
5444
Tim Petersced69f82003-09-16 20:30:58 +00005445static
5446PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005447 Py_ssize_t left,
5448 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 Py_UNICODE fill)
5450{
5451 PyUnicodeObject *u;
5452
5453 if (left < 0)
5454 left = 0;
5455 if (right < 0)
5456 right = 0;
5457
Tim Peters7a29bd52001-09-12 03:03:31 +00005458 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 Py_INCREF(self);
5460 return self;
5461 }
5462
5463 u = _PyUnicode_New(left + self->length + right);
5464 if (u) {
5465 if (left)
5466 Py_UNICODE_FILL(u->str, fill, left);
5467 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5468 if (right)
5469 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5470 }
5471
5472 return u;
5473}
5474
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the expanding function providing a local `PyObject *str`, a
   `list` object and an `onError` label; control jumps to onError if the
   slice cannot be created or appended.  The body is wrapped in
   do { } while (0) so that `SPLIT_APPEND(...);` is exactly one statement
   and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5485
5486static
5487PyObject *split_whitespace(PyUnicodeObject *self,
5488 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005491 register Py_ssize_t i;
5492 register Py_ssize_t j;
5493 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 PyObject *str;
5495
5496 for (i = j = 0; i < len; ) {
5497 /* find a token */
5498 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5499 i++;
5500 j = i;
5501 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5502 i++;
5503 if (j < i) {
5504 if (maxcount-- <= 0)
5505 break;
5506 SPLIT_APPEND(self->str, j, i);
5507 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5508 i++;
5509 j = i;
5510 }
5511 }
5512 if (j < len) {
5513 SPLIT_APPEND(self->str, j, len);
5514 }
5515 return list;
5516
5517 onError:
5518 Py_DECREF(list);
5519 return NULL;
5520}
5521
5522PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005523 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005525 register Py_ssize_t i;
5526 register Py_ssize_t j;
5527 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 PyObject *list;
5529 PyObject *str;
5530 Py_UNICODE *data;
5531
5532 string = PyUnicode_FromObject(string);
5533 if (string == NULL)
5534 return NULL;
5535 data = PyUnicode_AS_UNICODE(string);
5536 len = PyUnicode_GET_SIZE(string);
5537
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 list = PyList_New(0);
5539 if (!list)
5540 goto onError;
5541
5542 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005543 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005544
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005546 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548
5549 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005550 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 if (i < len) {
5552 if (data[i] == '\r' && i + 1 < len &&
5553 data[i+1] == '\n')
5554 i += 2;
5555 else
5556 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005557 if (keepends)
5558 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 }
Guido van Rossum86662912000-04-11 15:38:46 +00005560 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 j = i;
5562 }
5563 if (j < len) {
5564 SPLIT_APPEND(data, j, len);
5565 }
5566
5567 Py_DECREF(string);
5568 return list;
5569
5570 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005571 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 Py_DECREF(string);
5573 return NULL;
5574}
5575
Tim Petersced69f82003-09-16 20:30:58 +00005576static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577PyObject *split_char(PyUnicodeObject *self,
5578 PyObject *list,
5579 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005580 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005582 register Py_ssize_t i;
5583 register Py_ssize_t j;
5584 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 PyObject *str;
5586
5587 for (i = j = 0; i < len; ) {
5588 if (self->str[i] == ch) {
5589 if (maxcount-- <= 0)
5590 break;
5591 SPLIT_APPEND(self->str, j, i);
5592 i = j = i + 1;
5593 } else
5594 i++;
5595 }
5596 if (j <= len) {
5597 SPLIT_APPEND(self->str, j, len);
5598 }
5599 return list;
5600
5601 onError:
5602 Py_DECREF(list);
5603 return NULL;
5604}
5605
Tim Petersced69f82003-09-16 20:30:58 +00005606static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607PyObject *split_substring(PyUnicodeObject *self,
5608 PyObject *list,
5609 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005610 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005612 register Py_ssize_t i;
5613 register Py_ssize_t j;
5614 Py_ssize_t len = self->length;
5615 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 PyObject *str;
5617
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005618 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 if (Py_UNICODE_MATCH(self, i, substring)) {
5620 if (maxcount-- <= 0)
5621 break;
5622 SPLIT_APPEND(self->str, j, i);
5623 i = j = i + sublen;
5624 } else
5625 i++;
5626 }
5627 if (j <= len) {
5628 SPLIT_APPEND(self->str, j, len);
5629 }
5630 return list;
5631
5632 onError:
5633 Py_DECREF(list);
5634 return NULL;
5635}
5636
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005637static
5638PyObject *rsplit_whitespace(PyUnicodeObject *self,
5639 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005641{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 register Py_ssize_t i;
5643 register Py_ssize_t j;
5644 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005645 PyObject *str;
5646
5647 for (i = j = len - 1; i >= 0; ) {
5648 /* find a token */
5649 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5650 i--;
5651 j = i;
5652 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5653 i--;
5654 if (j > i) {
5655 if (maxcount-- <= 0)
5656 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005657 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005658 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5659 i--;
5660 j = i;
5661 }
5662 }
5663 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005664 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005665 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005666 if (PyList_Reverse(list) < 0)
5667 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005668 return list;
5669
5670 onError:
5671 Py_DECREF(list);
5672 return NULL;
5673}
5674
5675static
5676PyObject *rsplit_char(PyUnicodeObject *self,
5677 PyObject *list,
5678 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005679 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005680{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005681 register Py_ssize_t i;
5682 register Py_ssize_t j;
5683 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005684 PyObject *str;
5685
5686 for (i = j = len - 1; i >= 0; ) {
5687 if (self->str[i] == ch) {
5688 if (maxcount-- <= 0)
5689 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005690 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005691 j = i = i - 1;
5692 } else
5693 i--;
5694 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005695 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005696 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005697 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005698 if (PyList_Reverse(list) < 0)
5699 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005700 return list;
5701
5702 onError:
5703 Py_DECREF(list);
5704 return NULL;
5705}
5706
5707static
5708PyObject *rsplit_substring(PyUnicodeObject *self,
5709 PyObject *list,
5710 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005711 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005712{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005713 register Py_ssize_t i;
5714 register Py_ssize_t j;
5715 Py_ssize_t len = self->length;
5716 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005717 PyObject *str;
5718
5719 for (i = len - sublen, j = len; i >= 0; ) {
5720 if (Py_UNICODE_MATCH(self, i, substring)) {
5721 if (maxcount-- <= 0)
5722 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005723 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005724 j = i;
5725 i -= sublen;
5726 } else
5727 i--;
5728 }
5729 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005730 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005731 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005732 if (PyList_Reverse(list) < 0)
5733 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005734 return list;
5735
5736 onError:
5737 Py_DECREF(list);
5738 return NULL;
5739}
5740
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741#undef SPLIT_APPEND
5742
5743static
5744PyObject *split(PyUnicodeObject *self,
5745 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005746 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747{
5748 PyObject *list;
5749
5750 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005751 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
5753 list = PyList_New(0);
5754 if (!list)
5755 return NULL;
5756
5757 if (substring == NULL)
5758 return split_whitespace(self,list,maxcount);
5759
5760 else if (substring->length == 1)
5761 return split_char(self,list,substring->str[0],maxcount);
5762
5763 else if (substring->length == 0) {
5764 Py_DECREF(list);
5765 PyErr_SetString(PyExc_ValueError, "empty separator");
5766 return NULL;
5767 }
5768 else
5769 return split_substring(self,list,substring,maxcount);
5770}
5771
Tim Petersced69f82003-09-16 20:30:58 +00005772static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005773PyObject *rsplit(PyUnicodeObject *self,
5774 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005775 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005776{
5777 PyObject *list;
5778
5779 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005780 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005781
5782 list = PyList_New(0);
5783 if (!list)
5784 return NULL;
5785
5786 if (substring == NULL)
5787 return rsplit_whitespace(self,list,maxcount);
5788
5789 else if (substring->length == 1)
5790 return rsplit_char(self,list,substring->str[0],maxcount);
5791
5792 else if (substring->length == 0) {
5793 Py_DECREF(list);
5794 PyErr_SetString(PyExc_ValueError, "empty separator");
5795 return NULL;
5796 }
5797 else
5798 return rsplit_substring(self,list,substring,maxcount);
5799}
5800
5801static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802PyObject *replace(PyUnicodeObject *self,
5803 PyUnicodeObject *str1,
5804 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005805 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806{
5807 PyUnicodeObject *u;
5808
5809 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005810 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811
Thomas Wouters477c8d52006-05-27 19:21:47 +00005812 if (str1->length == str2->length) {
5813 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005814 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005815 if (str1->length == 1) {
5816 /* replace characters */
5817 Py_UNICODE u1, u2;
5818 if (!findchar(self->str, self->length, str1->str[0]))
5819 goto nothing;
5820 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5821 if (!u)
5822 return NULL;
5823 Py_UNICODE_COPY(u->str, self->str, self->length);
5824 u1 = str1->str[0];
5825 u2 = str2->str[0];
5826 for (i = 0; i < u->length; i++)
5827 if (u->str[i] == u1) {
5828 if (--maxcount < 0)
5829 break;
5830 u->str[i] = u2;
5831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005833 i = fastsearch(
5834 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005836 if (i < 0)
5837 goto nothing;
5838 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5839 if (!u)
5840 return NULL;
5841 Py_UNICODE_COPY(u->str, self->str, self->length);
5842 while (i <= self->length - str1->length)
5843 if (Py_UNICODE_MATCH(self, i, str1)) {
5844 if (--maxcount < 0)
5845 break;
5846 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5847 i += str1->length;
5848 } else
5849 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005852
5853 Py_ssize_t n, i, j, e;
5854 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 Py_UNICODE *p;
5856
5857 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005858 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 if (n > maxcount)
5860 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005861 if (n == 0)
5862 goto nothing;
5863 /* new_size = self->length + n * (str2->length - str1->length)); */
5864 delta = (str2->length - str1->length);
5865 if (delta == 0) {
5866 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005868 product = n * (str2->length - str1->length);
5869 if ((product / (str2->length - str1->length)) != n) {
5870 PyErr_SetString(PyExc_OverflowError,
5871 "replace string is too long");
5872 return NULL;
5873 }
5874 new_size = self->length + product;
5875 if (new_size < 0) {
5876 PyErr_SetString(PyExc_OverflowError,
5877 "replace string is too long");
5878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 }
5880 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005881 u = _PyUnicode_New(new_size);
5882 if (!u)
5883 return NULL;
5884 i = 0;
5885 p = u->str;
5886 e = self->length - str1->length;
5887 if (str1->length > 0) {
5888 while (n-- > 0) {
5889 /* look for next match */
5890 j = i;
5891 while (j <= e) {
5892 if (Py_UNICODE_MATCH(self, j, str1))
5893 break;
5894 j++;
5895 }
5896 if (j > i) {
5897 if (j > e)
5898 break;
5899 /* copy unchanged part [i:j] */
5900 Py_UNICODE_COPY(p, self->str+i, j-i);
5901 p += j - i;
5902 }
5903 /* copy substitution string */
5904 if (str2->length > 0) {
5905 Py_UNICODE_COPY(p, str2->str, str2->length);
5906 p += str2->length;
5907 }
5908 i = j + str1->length;
5909 }
5910 if (i < self->length)
5911 /* copy tail [i:] */
5912 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5913 } else {
5914 /* interleave */
5915 while (n > 0) {
5916 Py_UNICODE_COPY(p, str2->str, str2->length);
5917 p += str2->length;
5918 if (--n <= 0)
5919 break;
5920 *p++ = self->str[i++];
5921 }
5922 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005926
5927nothing:
5928 /* nothing to replace; return original string (when possible) */
5929 if (PyUnicode_CheckExact(self)) {
5930 Py_INCREF(self);
5931 return (PyObject *) self;
5932 }
5933 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934}
5935
5936/* --- Unicode Object Methods --------------------------------------------- */
5937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005938PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939"S.title() -> unicode\n\
5940\n\
5941Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005942characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
5944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005945unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 return fixup(self, fixtitle);
5948}
5949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005950PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951"S.capitalize() -> unicode\n\
5952\n\
5953Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005954have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
5956static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005957unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 return fixup(self, fixcapitalize);
5960}
5961
5962#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005963PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964"S.capwords() -> unicode\n\
5965\n\
5966Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005967normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968
5969static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005970unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971{
5972 PyObject *list;
5973 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005974 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 /* Split into words */
5977 list = split(self, NULL, -1);
5978 if (!list)
5979 return NULL;
5980
5981 /* Capitalize each word */
5982 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5983 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5984 fixcapitalize);
5985 if (item == NULL)
5986 goto onError;
5987 Py_DECREF(PyList_GET_ITEM(list, i));
5988 PyList_SET_ITEM(list, i, item);
5989 }
5990
5991 /* Join the words to form a new string */
5992 item = PyUnicode_Join(NULL, list);
5993
5994onError:
5995 Py_DECREF(list);
5996 return (PyObject *)item;
5997}
5998#endif
5999
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006000/* Argument converter. Coerces to a single unicode character */
6001
6002static int
6003convert_uc(PyObject *obj, void *addr)
6004{
6005 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6006 PyObject *uniobj;
6007 Py_UNICODE *unistr;
6008
6009 uniobj = PyUnicode_FromObject(obj);
6010 if (uniobj == NULL) {
6011 PyErr_SetString(PyExc_TypeError,
6012 "The fill character cannot be converted to Unicode");
6013 return 0;
6014 }
6015 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6016 PyErr_SetString(PyExc_TypeError,
6017 "The fill character must be exactly one character long");
6018 Py_DECREF(uniobj);
6019 return 0;
6020 }
6021 unistr = PyUnicode_AS_UNICODE(uniobj);
6022 *fillcharloc = unistr[0];
6023 Py_DECREF(uniobj);
6024 return 1;
6025}
6026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006027PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006028"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006030Return S centered in a Unicode string of length width. Padding is\n\
6031done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
6033static PyObject *
6034unicode_center(PyUnicodeObject *self, PyObject *args)
6035{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006036 Py_ssize_t marg, left;
6037 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006038 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039
Thomas Woutersde017742006-02-16 19:34:37 +00006040 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return NULL;
6042
Tim Peters7a29bd52001-09-12 03:03:31 +00006043 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 Py_INCREF(self);
6045 return (PyObject*) self;
6046 }
6047
6048 marg = width - self->length;
6049 left = marg / 2 + (marg & width & 1);
6050
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006051 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052}
6053
Marc-André Lemburge5034372000-08-08 08:04:29 +00006054#if 0
6055
6056/* This code should go into some future Unicode collation support
6057 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006058 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006059
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006060/* speedy UTF-16 code point order comparison */
6061/* gleaned from: */
6062/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6063
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006064static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006065{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006066 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006067 0, 0, 0, 0, 0, 0, 0, 0,
6068 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006069 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006070};
6071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072static int
6073unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6074{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006075 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006076
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 Py_UNICODE *s1 = str1->str;
6078 Py_UNICODE *s2 = str2->str;
6079
6080 len1 = str1->length;
6081 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006082
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006084 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006085
6086 c1 = *s1++;
6087 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006088
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006089 if (c1 > (1<<11) * 26)
6090 c1 += utf16Fixup[c1>>11];
6091 if (c2 > (1<<11) * 26)
6092 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006093 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006094
6095 if (c1 != c2)
6096 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006097
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006098 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 }
6100
6101 return (len1 < len2) ? -1 : (len1 != len2);
6102}
6103
Marc-André Lemburge5034372000-08-08 08:04:29 +00006104#else
6105
6106static int
6107unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6108{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006109 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006110
6111 Py_UNICODE *s1 = str1->str;
6112 Py_UNICODE *s2 = str2->str;
6113
6114 len1 = str1->length;
6115 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006116
Marc-André Lemburge5034372000-08-08 08:04:29 +00006117 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006118 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006119
Fredrik Lundh45714e92001-06-26 16:39:36 +00006120 c1 = *s1++;
6121 c2 = *s2++;
6122
6123 if (c1 != c2)
6124 return (c1 < c2) ? -1 : 1;
6125
Marc-André Lemburge5034372000-08-08 08:04:29 +00006126 len1--; len2--;
6127 }
6128
6129 return (len1 < len2) ? -1 : (len1 != len2);
6130}
6131
6132#endif
6133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134int PyUnicode_Compare(PyObject *left,
6135 PyObject *right)
6136{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006137 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6138 return unicode_compare((PyUnicodeObject *)left,
6139 (PyUnicodeObject *)right);
6140 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6141 (PyUnicode_Check(left) && PyString_Check(right))) {
6142 if (PyUnicode_Check(left))
6143 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6144 if (PyUnicode_Check(right))
6145 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6146 assert(PyString_Check(left));
6147 assert(PyString_Check(right));
6148 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006150 PyErr_Format(PyExc_TypeError,
6151 "Can't compare %.100s and %.100s",
6152 left->ob_type->tp_name,
6153 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 return -1;
6155}
6156
Martin v. Löwis5b222132007-06-10 09:51:05 +00006157int
6158PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6159{
6160 int i;
6161 Py_UNICODE *id;
6162 assert(PyUnicode_Check(uni));
6163 id = PyUnicode_AS_UNICODE(uni);
6164 /* Compare Unicode string and source character set string */
6165 for (i = 0; id[i] && str[i]; i++)
6166 if (id[i] != str[i])
6167 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6168 if (id[i])
6169 return 1; /* uni is longer */
6170 if (str[i])
6171 return -1; /* str is longer */
6172 return 0;
6173}
6174
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006175PyObject *PyUnicode_RichCompare(PyObject *left,
6176 PyObject *right,
6177 int op)
6178{
6179 int result;
6180
6181 result = PyUnicode_Compare(left, right);
6182 if (result == -1 && PyErr_Occurred())
6183 goto onError;
6184
6185 /* Convert the return value to a Boolean */
6186 switch (op) {
6187 case Py_EQ:
6188 result = (result == 0);
6189 break;
6190 case Py_NE:
6191 result = (result != 0);
6192 break;
6193 case Py_LE:
6194 result = (result <= 0);
6195 break;
6196 case Py_GE:
6197 result = (result >= 0);
6198 break;
6199 case Py_LT:
6200 result = (result == -1);
6201 break;
6202 case Py_GT:
6203 result = (result == 1);
6204 break;
6205 }
6206 return PyBool_FromLong(result);
6207
6208 onError:
6209
6210 /* Standard case
6211
6212 Type errors mean that PyUnicode_FromObject() could not convert
6213 one of the arguments (usually the right hand side) to Unicode,
6214 ie. we can't handle the comparison request. However, it is
6215 possible that the other object knows a comparison method, which
6216 is why we return Py_NotImplemented to give the other object a
6217 chance.
6218
6219 */
6220 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6221 PyErr_Clear();
6222 Py_INCREF(Py_NotImplemented);
6223 return Py_NotImplemented;
6224 }
6225 if (op != Py_EQ && op != Py_NE)
6226 return NULL;
6227
6228 /* Equality comparison.
6229
6230 This is a special case: we silence any PyExc_UnicodeDecodeError
6231 and instead turn it into a PyErr_UnicodeWarning.
6232
6233 */
6234 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6235 return NULL;
6236 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006237 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6238 (op == Py_EQ) ?
6239 "Unicode equal comparison "
6240 "failed to convert both arguments to Unicode - "
6241 "interpreting them as being unequal"
6242 :
6243 "Unicode unequal comparison "
6244 "failed to convert both arguments to Unicode - "
6245 "interpreting them as being unequal",
6246 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006247 return NULL;
6248 result = (op == Py_NE);
6249 return PyBool_FromLong(result);
6250}
6251
Guido van Rossum403d68b2000-03-13 15:55:09 +00006252int PyUnicode_Contains(PyObject *container,
6253 PyObject *element)
6254{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006255 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006256 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006257
6258 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006259 sub = PyUnicode_FromObject(element);
6260 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006261 PyErr_Format(PyExc_TypeError,
6262 "'in <string>' requires string as left operand, not %s",
6263 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006264 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006265 }
6266
Thomas Wouters477c8d52006-05-27 19:21:47 +00006267 str = PyUnicode_FromObject(container);
6268 if (!str) {
6269 Py_DECREF(sub);
6270 return -1;
6271 }
6272
6273 result = stringlib_contains_obj(str, sub);
6274
6275 Py_DECREF(str);
6276 Py_DECREF(sub);
6277
Guido van Rossum403d68b2000-03-13 15:55:09 +00006278 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006279}
6280
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281/* Concat to string or Unicode object giving a new Unicode object. */
6282
6283PyObject *PyUnicode_Concat(PyObject *left,
6284 PyObject *right)
6285{
6286 PyUnicodeObject *u = NULL, *v = NULL, *w;
6287
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006288 if (PyBytes_Check(left) || PyBytes_Check(right))
6289 return PyBytes_Concat(left, right);
6290
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 /* Coerce the two arguments */
6292 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6293 if (u == NULL)
6294 goto onError;
6295 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6296 if (v == NULL)
6297 goto onError;
6298
6299 /* Shortcuts */
6300 if (v == unicode_empty) {
6301 Py_DECREF(v);
6302 return (PyObject *)u;
6303 }
6304 if (u == unicode_empty) {
6305 Py_DECREF(u);
6306 return (PyObject *)v;
6307 }
6308
6309 /* Concat the two Unicode strings */
6310 w = _PyUnicode_New(u->length + v->length);
6311 if (w == NULL)
6312 goto onError;
6313 Py_UNICODE_COPY(w->str, u->str, u->length);
6314 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6315
6316 Py_DECREF(u);
6317 Py_DECREF(v);
6318 return (PyObject *)w;
6319
6320onError:
6321 Py_XDECREF(u);
6322 Py_XDECREF(v);
6323 return NULL;
6324}
6325
Walter Dörwald1ab83302007-05-18 17:15:44 +00006326void
6327PyUnicode_Append(PyObject **pleft, PyObject *right)
6328{
6329 PyObject *new;
6330 if (*pleft == NULL)
6331 return;
6332 if (right == NULL || !PyUnicode_Check(*pleft)) {
6333 Py_DECREF(*pleft);
6334 *pleft = NULL;
6335 return;
6336 }
6337 new = PyUnicode_Concat(*pleft, right);
6338 Py_DECREF(*pleft);
6339 *pleft = new;
6340}
6341
6342void
6343PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6344{
6345 PyUnicode_Append(pleft, right);
6346 Py_XDECREF(right);
6347}
6348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006349PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350"S.count(sub[, start[, end]]) -> int\n\
6351\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006352Return the number of non-overlapping occurrences of substring sub in\n\
6353Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006354interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
6356static PyObject *
6357unicode_count(PyUnicodeObject *self, PyObject *args)
6358{
6359 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006360 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006361 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 PyObject *result;
6363
Guido van Rossumb8872e62000-05-09 14:14:27 +00006364 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6365 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 return NULL;
6367
6368 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 if (substring == NULL)
6371 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006372
Thomas Wouters477c8d52006-05-27 19:21:47 +00006373 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374
Thomas Wouters477c8d52006-05-27 19:21:47 +00006375 result = PyInt_FromSsize_t(
6376 stringlib_count(self->str + start, end - start,
6377 substring->str, substring->length)
6378 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379
6380 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006381
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 return result;
6383}
6384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006385PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006386"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006388Encodes S using the codec registered for encoding. encoding defaults\n\
6389to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006390handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6392'xmlcharrefreplace' as well as any other name registered with\n\
6393codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395static PyObject *
6396unicode_encode(PyUnicodeObject *self, PyObject *args)
6397{
6398 char *encoding = NULL;
6399 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006400 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6403 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006404 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006405 if (v == NULL)
6406 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006407 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006408 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006409 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006410 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006411 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006412 Py_DECREF(v);
6413 return NULL;
6414 }
6415 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006416
6417 onError:
6418 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006419}
6420
6421PyDoc_STRVAR(decode__doc__,
6422"S.decode([encoding[,errors]]) -> string or unicode\n\
6423\n\
6424Decodes S using the codec registered for encoding. encoding defaults\n\
6425to the default encoding. errors may be given to set a different error\n\
6426handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6427a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6428as well as any other name registerd with codecs.register_error that is\n\
6429able to handle UnicodeDecodeErrors.");
6430
6431static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006432unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006433{
Guido van Rossuma74184e2007-08-29 04:05:57 +00006434 PyErr_Format(PyExc_TypeError, "decoding str is not supported");
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006435 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436}
6437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006438PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439"S.expandtabs([tabsize]) -> unicode\n\
6440\n\
6441Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006442If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443
6444static PyObject*
6445unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6446{
6447 Py_UNICODE *e;
6448 Py_UNICODE *p;
6449 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006450 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 PyUnicodeObject *u;
6452 int tabsize = 8;
6453
6454 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6455 return NULL;
6456
Thomas Wouters7e474022000-07-16 12:04:32 +00006457 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006458 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 e = self->str + self->length;
6460 for (p = self->str; p < e; p++)
6461 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006462 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006464 if (old_j > j) {
6465 PyErr_SetString(PyExc_OverflowError,
6466 "new string is too long");
6467 return NULL;
6468 }
6469 old_j = j;
6470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 }
6472 else {
6473 j++;
6474 if (*p == '\n' || *p == '\r') {
6475 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006476 old_j = j = 0;
6477 if (i < 0) {
6478 PyErr_SetString(PyExc_OverflowError,
6479 "new string is too long");
6480 return NULL;
6481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 }
6483 }
6484
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006485 if ((i + j) < 0) {
6486 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6487 return NULL;
6488 }
6489
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 /* Second pass: create output string and fill it */
6491 u = _PyUnicode_New(i + j);
6492 if (!u)
6493 return NULL;
6494
6495 j = 0;
6496 q = u->str;
6497
6498 for (p = self->str; p < e; p++)
6499 if (*p == '\t') {
6500 if (tabsize > 0) {
6501 i = tabsize - (j % tabsize);
6502 j += i;
6503 while (i--)
6504 *q++ = ' ';
6505 }
6506 }
6507 else {
6508 j++;
6509 *q++ = *p;
6510 if (*p == '\n' || *p == '\r')
6511 j = 0;
6512 }
6513
6514 return (PyObject*) u;
6515}
6516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006517PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518"S.find(sub [,start [,end]]) -> int\n\
6519\n\
6520Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006521such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522arguments start and end are interpreted as in slice notation.\n\
6523\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006524Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
6526static PyObject *
6527unicode_find(PyUnicodeObject *self, PyObject *args)
6528{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006529 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006530 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006531 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006532 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
Guido van Rossumb8872e62000-05-09 14:14:27 +00006534 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6535 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006537 substring = PyUnicode_FromObject(substring);
6538 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 return NULL;
6540
Thomas Wouters477c8d52006-05-27 19:21:47 +00006541 result = stringlib_find_slice(
6542 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6543 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6544 start, end
6545 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
6547 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006548
6549 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550}
6551
6552static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006553unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554{
6555 if (index < 0 || index >= self->length) {
6556 PyErr_SetString(PyExc_IndexError, "string index out of range");
6557 return NULL;
6558 }
6559
6560 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6561}
6562
6563static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006564unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006566 /* Since Unicode objects compare equal to their UTF-8 string
6567 counterparts, we hash the UTF-8 string. */
6568 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6569 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570}
6571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006572PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573"S.index(sub [,start [,end]]) -> int\n\
6574\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006575Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
6577static PyObject *
6578unicode_index(PyUnicodeObject *self, PyObject *args)
6579{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006580 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006581 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006582 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006583 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
Guido van Rossumb8872e62000-05-09 14:14:27 +00006585 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6586 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006588 substring = PyUnicode_FromObject(substring);
6589 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 return NULL;
6591
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592 result = stringlib_find_slice(
6593 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6594 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6595 start, end
6596 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
6598 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 if (result < 0) {
6601 PyErr_SetString(PyExc_ValueError, "substring not found");
6602 return NULL;
6603 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006604
Martin v. Löwis18e16552006-02-15 17:27:45 +00006605 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606}
6607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006608PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006609"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006611Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006612at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
6614static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006615unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
6617 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6618 register const Py_UNICODE *e;
6619 int cased;
6620
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 /* Shortcut for single character strings */
6622 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006623 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006625 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006626 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006627 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 e = p + PyUnicode_GET_SIZE(self);
6630 cased = 0;
6631 for (; p < e; p++) {
6632 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006633
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006635 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 else if (!cased && Py_UNICODE_ISLOWER(ch))
6637 cased = 1;
6638 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006639 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640}
6641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006642PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006643"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006645Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006646at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647
6648static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006649unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650{
6651 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6652 register const Py_UNICODE *e;
6653 int cased;
6654
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 /* Shortcut for single character strings */
6656 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006657 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006659 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006660 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006661 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006662
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 e = p + PyUnicode_GET_SIZE(self);
6664 cased = 0;
6665 for (; p < e; p++) {
6666 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006667
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006669 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 else if (!cased && Py_UNICODE_ISUPPER(ch))
6671 cased = 1;
6672 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006673 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006676PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006677"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006679Return True if S is a titlecased string and there is at least one\n\
6680character in S, i.e. upper- and titlecase characters may only\n\
6681follow uncased characters and lowercase characters only cased ones.\n\
6682Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
6684static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006685unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686{
6687 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6688 register const Py_UNICODE *e;
6689 int cased, previous_is_cased;
6690
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 /* Shortcut for single character strings */
6692 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006693 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6694 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006696 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006697 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006698 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006699
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 e = p + PyUnicode_GET_SIZE(self);
6701 cased = 0;
6702 previous_is_cased = 0;
6703 for (; p < e; p++) {
6704 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006705
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6707 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006708 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 previous_is_cased = 1;
6710 cased = 1;
6711 }
6712 else if (Py_UNICODE_ISLOWER(ch)) {
6713 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006714 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 previous_is_cased = 1;
6716 cased = 1;
6717 }
6718 else
6719 previous_is_cased = 0;
6720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006721 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722}
6723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006724PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006725"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006727Return True if all characters in S are whitespace\n\
6728and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
6730static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006731unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732{
6733 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6734 register const Py_UNICODE *e;
6735
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 /* Shortcut for single character strings */
6737 if (PyUnicode_GET_SIZE(self) == 1 &&
6738 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006739 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006741 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006742 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006744
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 e = p + PyUnicode_GET_SIZE(self);
6746 for (; p < e; p++) {
6747 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006748 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006750 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751}
6752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006753PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006754"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006755\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006756Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006757and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006758
6759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006760unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006761{
6762 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6763 register const Py_UNICODE *e;
6764
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006765 /* Shortcut for single character strings */
6766 if (PyUnicode_GET_SIZE(self) == 1 &&
6767 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006768 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006769
6770 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006771 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006772 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006773
6774 e = p + PyUnicode_GET_SIZE(self);
6775 for (; p < e; p++) {
6776 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006779 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006780}
6781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006784\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006785Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006786and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006787
6788static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006789unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006790{
6791 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6792 register const Py_UNICODE *e;
6793
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006794 /* Shortcut for single character strings */
6795 if (PyUnicode_GET_SIZE(self) == 1 &&
6796 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006797 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006798
6799 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006800 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006801 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006802
6803 e = p + PyUnicode_GET_SIZE(self);
6804 for (; p < e; p++) {
6805 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006808 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006809}
6810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006811PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006814Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006815False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816
6817static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006818unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819{
6820 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6821 register const Py_UNICODE *e;
6822
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 /* Shortcut for single character strings */
6824 if (PyUnicode_GET_SIZE(self) == 1 &&
6825 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006828 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006829 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006830 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006831
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832 e = p + PyUnicode_GET_SIZE(self);
6833 for (; p < e; p++) {
6834 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006835 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006837 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838}
6839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006840PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006841"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006843Return True if all characters in S are digits\n\
6844and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
6846static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006847unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848{
6849 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6850 register const Py_UNICODE *e;
6851
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 /* Shortcut for single character strings */
6853 if (PyUnicode_GET_SIZE(self) == 1 &&
6854 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006857 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006858 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006859 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006860
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 e = p + PyUnicode_GET_SIZE(self);
6862 for (; p < e; p++) {
6863 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006866 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867}
6868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006869PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874
6875static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006876unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877{
6878 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6879 register const Py_UNICODE *e;
6880
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 /* Shortcut for single character strings */
6882 if (PyUnicode_GET_SIZE(self) == 1 &&
6883 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006886 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006887 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006888 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006889
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 e = p + PyUnicode_GET_SIZE(self);
6891 for (; p < e; p++) {
6892 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006895 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Martin v. Löwis47383402007-08-15 07:32:56 +00006898int
6899PyUnicode_IsIdentifier(PyObject *self)
6900{
6901 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6902 register const Py_UNICODE *e;
6903
6904 /* Special case for empty strings */
6905 if (PyUnicode_GET_SIZE(self) == 0)
6906 return 0;
6907
6908 /* PEP 3131 says that the first character must be in
6909 XID_Start and subsequent characters in XID_Continue,
6910 and for the ASCII range, the 2.x rules apply (i.e
6911 start with letters and underscore, continue with
6912 letters, digits, underscore). However, given the current
6913 definition of XID_Start and XID_Continue, it is sufficient
6914 to check just for these, except that _ must be allowed
6915 as starting an identifier. */
6916 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6917 return 0;
6918
6919 e = p + PyUnicode_GET_SIZE(self);
6920 for (p++; p < e; p++) {
6921 if (!_PyUnicode_IsXidContinue(*p))
6922 return 0;
6923 }
6924 return 1;
6925}
6926
6927PyDoc_STRVAR(isidentifier__doc__,
6928"S.isidentifier() -> bool\n\
6929\n\
6930Return True if S is a valid identifier according\n\
6931to the language definition.");
6932
6933static PyObject*
6934unicode_isidentifier(PyObject *self)
6935{
6936 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6937}
6938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006939PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940"S.join(sequence) -> unicode\n\
6941\n\
6942Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006943sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944
6945static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006946unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006948 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949}
6950
Martin v. Löwis18e16552006-02-15 17:27:45 +00006951static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952unicode_length(PyUnicodeObject *self)
6953{
6954 return self->length;
6955}
6956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006957PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006958"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959\n\
6960Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006961done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962
6963static PyObject *
6964unicode_ljust(PyUnicodeObject *self, PyObject *args)
6965{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006966 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006967 Py_UNICODE fillchar = ' ';
6968
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006969 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970 return NULL;
6971
Tim Peters7a29bd52001-09-12 03:03:31 +00006972 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 Py_INCREF(self);
6974 return (PyObject*) self;
6975 }
6976
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006977 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978}
6979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006980PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981"S.lower() -> unicode\n\
6982\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006983Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984
6985static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006986unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 return fixup(self, fixlower);
6989}
6990
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
6999
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007000/* externally visible for str.strip(unicode) */
7001PyObject *
7002_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7003{
7004 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007005 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007006 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007007 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7008 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007009
Thomas Wouters477c8d52006-05-27 19:21:47 +00007010 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7011
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007012 i = 0;
7013 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007014 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7015 i++;
7016 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007017 }
7018
7019 j = len;
7020 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007021 do {
7022 j--;
7023 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7024 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007025 }
7026
7027 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007028 Py_INCREF(self);
7029 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007030 }
7031 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007032 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007033}
7034
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035
7036static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007037do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007039 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007040 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007041
7042 i = 0;
7043 if (striptype != RIGHTSTRIP) {
7044 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7045 i++;
7046 }
7047 }
7048
7049 j = len;
7050 if (striptype != LEFTSTRIP) {
7051 do {
7052 j--;
7053 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7054 j++;
7055 }
7056
7057 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7058 Py_INCREF(self);
7059 return (PyObject*)self;
7060 }
7061 else
7062 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063}
7064
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007065
7066static PyObject *
7067do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7068{
7069 PyObject *sep = NULL;
7070
7071 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7072 return NULL;
7073
7074 if (sep != NULL && sep != Py_None) {
7075 if (PyUnicode_Check(sep))
7076 return _PyUnicode_XStrip(self, striptype, sep);
7077 else if (PyString_Check(sep)) {
7078 PyObject *res;
7079 sep = PyUnicode_FromObject(sep);
7080 if (sep==NULL)
7081 return NULL;
7082 res = _PyUnicode_XStrip(self, striptype, sep);
7083 Py_DECREF(sep);
7084 return res;
7085 }
7086 else {
7087 PyErr_Format(PyExc_TypeError,
7088 "%s arg must be None, unicode or str",
7089 STRIPNAME(striptype));
7090 return NULL;
7091 }
7092 }
7093
7094 return do_strip(self, striptype);
7095}
7096
7097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007098PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007099"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007100\n\
7101Return a copy of the string S with leading and trailing\n\
7102whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007103If chars is given and not None, remove characters in chars instead.\n\
7104If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007105
7106static PyObject *
7107unicode_strip(PyUnicodeObject *self, PyObject *args)
7108{
7109 if (PyTuple_GET_SIZE(args) == 0)
7110 return do_strip(self, BOTHSTRIP); /* Common case */
7111 else
7112 return do_argstrip(self, BOTHSTRIP, args);
7113}
7114
7115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007117"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118\n\
7119Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007120If chars is given and not None, remove characters in chars instead.\n\
7121If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007122
7123static PyObject *
7124unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7125{
7126 if (PyTuple_GET_SIZE(args) == 0)
7127 return do_strip(self, LEFTSTRIP); /* Common case */
7128 else
7129 return do_argstrip(self, LEFTSTRIP, args);
7130}
7131
7132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007133PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007134"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007135\n\
7136Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007137If chars is given and not None, remove characters in chars instead.\n\
7138If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007139
7140static PyObject *
7141unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7142{
7143 if (PyTuple_GET_SIZE(args) == 0)
7144 return do_strip(self, RIGHTSTRIP); /* Common case */
7145 else
7146 return do_argstrip(self, RIGHTSTRIP, args);
7147}
7148
7149
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007151unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152{
7153 PyUnicodeObject *u;
7154 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007155 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007156 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
7158 if (len < 0)
7159 len = 0;
7160
Tim Peters7a29bd52001-09-12 03:03:31 +00007161 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 /* no repeat, return original string */
7163 Py_INCREF(str);
7164 return (PyObject*) str;
7165 }
Tim Peters8f422462000-09-09 06:13:41 +00007166
7167 /* ensure # of chars needed doesn't overflow int and # of bytes
7168 * needed doesn't overflow size_t
7169 */
7170 nchars = len * str->length;
7171 if (len && nchars / len != str->length) {
7172 PyErr_SetString(PyExc_OverflowError,
7173 "repeated string is too long");
7174 return NULL;
7175 }
7176 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7177 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7178 PyErr_SetString(PyExc_OverflowError,
7179 "repeated string is too long");
7180 return NULL;
7181 }
7182 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 if (!u)
7184 return NULL;
7185
7186 p = u->str;
7187
Thomas Wouters477c8d52006-05-27 19:21:47 +00007188 if (str->length == 1 && len > 0) {
7189 Py_UNICODE_FILL(p, str->str[0], len);
7190 } else {
7191 Py_ssize_t done = 0; /* number of characters copied this far */
7192 if (done < nchars) {
7193 Py_UNICODE_COPY(p, str->str, str->length);
7194 done = str->length;
7195 }
7196 while (done < nchars) {
7197 int n = (done <= nchars-done) ? done : nchars-done;
7198 Py_UNICODE_COPY(p+done, p, n);
7199 done += n;
7200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 }
7202
7203 return (PyObject*) u;
7204}
7205
7206PyObject *PyUnicode_Replace(PyObject *obj,
7207 PyObject *subobj,
7208 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007209 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210{
7211 PyObject *self;
7212 PyObject *str1;
7213 PyObject *str2;
7214 PyObject *result;
7215
7216 self = PyUnicode_FromObject(obj);
7217 if (self == NULL)
7218 return NULL;
7219 str1 = PyUnicode_FromObject(subobj);
7220 if (str1 == NULL) {
7221 Py_DECREF(self);
7222 return NULL;
7223 }
7224 str2 = PyUnicode_FromObject(replobj);
7225 if (str2 == NULL) {
7226 Py_DECREF(self);
7227 Py_DECREF(str1);
7228 return NULL;
7229 }
Tim Petersced69f82003-09-16 20:30:58 +00007230 result = replace((PyUnicodeObject *)self,
7231 (PyUnicodeObject *)str1,
7232 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 maxcount);
7234 Py_DECREF(self);
7235 Py_DECREF(str1);
7236 Py_DECREF(str2);
7237 return result;
7238}
7239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007240PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241"S.replace (old, new[, maxsplit]) -> unicode\n\
7242\n\
7243Return a copy of S with all occurrences of substring\n\
7244old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007245given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246
7247static PyObject*
7248unicode_replace(PyUnicodeObject *self, PyObject *args)
7249{
7250 PyUnicodeObject *str1;
7251 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 PyObject *result;
7254
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 return NULL;
7257 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7258 if (str1 == NULL)
7259 return NULL;
7260 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007261 if (str2 == NULL) {
7262 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
7266 result = replace(self, str1, str2, maxcount);
7267
7268 Py_DECREF(str1);
7269 Py_DECREF(str2);
7270 return result;
7271}
7272
7273static
7274PyObject *unicode_repr(PyObject *unicode)
7275{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007276 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007277 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007278 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7279 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7280
7281 /* XXX(nnorwitz): rather than over-allocating, it would be
7282 better to choose a different scheme. Perhaps scan the
7283 first N-chars of the string and allocate based on that size.
7284 */
7285 /* Initial allocation is based on the longest-possible unichr
7286 escape.
7287
7288 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7289 unichr, so in this case it's the longest unichr escape. In
7290 narrow (UTF-16) builds this is five chars per source unichr
7291 since there are two unichrs in the surrogate pair, so in narrow
7292 (UTF-16) builds it's not the longest unichr escape.
7293
7294 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7295 so in the narrow (UTF-16) build case it's the longest unichr
7296 escape.
7297 */
7298
Walter Dörwald1ab83302007-05-18 17:15:44 +00007299 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007300 2 /* quotes */
7301#ifdef Py_UNICODE_WIDE
7302 + 10*size
7303#else
7304 + 6*size
7305#endif
7306 + 1);
7307 if (repr == NULL)
7308 return NULL;
7309
Walter Dörwald1ab83302007-05-18 17:15:44 +00007310 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007311
7312 /* Add quote */
7313 *p++ = (findchar(s, size, '\'') &&
7314 !findchar(s, size, '"')) ? '"' : '\'';
7315 while (size-- > 0) {
7316 Py_UNICODE ch = *s++;
7317
7318 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007319 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007320 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007321 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007322 continue;
7323 }
7324
7325#ifdef Py_UNICODE_WIDE
7326 /* Map 21-bit characters to '\U00xxxxxx' */
7327 else if (ch >= 0x10000) {
7328 *p++ = '\\';
7329 *p++ = 'U';
7330 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7331 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7332 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7333 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7334 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7335 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7336 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7337 *p++ = hexdigits[ch & 0x0000000F];
7338 continue;
7339 }
7340#else
7341 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7342 else if (ch >= 0xD800 && ch < 0xDC00) {
7343 Py_UNICODE ch2;
7344 Py_UCS4 ucs;
7345
7346 ch2 = *s++;
7347 size--;
7348 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7349 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7350 *p++ = '\\';
7351 *p++ = 'U';
7352 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7353 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7354 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7355 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7356 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7357 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7358 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7359 *p++ = hexdigits[ucs & 0x0000000F];
7360 continue;
7361 }
7362 /* Fall through: isolated surrogates are copied as-is */
7363 s--;
7364 size++;
7365 }
7366#endif
7367
7368 /* Map 16-bit characters to '\uxxxx' */
7369 if (ch >= 256) {
7370 *p++ = '\\';
7371 *p++ = 'u';
7372 *p++ = hexdigits[(ch >> 12) & 0x000F];
7373 *p++ = hexdigits[(ch >> 8) & 0x000F];
7374 *p++ = hexdigits[(ch >> 4) & 0x000F];
7375 *p++ = hexdigits[ch & 0x000F];
7376 }
7377
7378 /* Map special whitespace to '\t', \n', '\r' */
7379 else if (ch == '\t') {
7380 *p++ = '\\';
7381 *p++ = 't';
7382 }
7383 else if (ch == '\n') {
7384 *p++ = '\\';
7385 *p++ = 'n';
7386 }
7387 else if (ch == '\r') {
7388 *p++ = '\\';
7389 *p++ = 'r';
7390 }
7391
7392 /* Map non-printable US ASCII to '\xhh' */
7393 else if (ch < ' ' || ch >= 0x7F) {
7394 *p++ = '\\';
7395 *p++ = 'x';
7396 *p++ = hexdigits[(ch >> 4) & 0x000F];
7397 *p++ = hexdigits[ch & 0x000F];
7398 }
7399
7400 /* Copy everything else as-is */
7401 else
7402 *p++ = (char) ch;
7403 }
7404 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007405 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007406
7407 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007408 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007409 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410}
7411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007412PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413"S.rfind(sub [,start [,end]]) -> int\n\
7414\n\
7415Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007416such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417arguments start and end are interpreted as in slice notation.\n\
7418\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007419Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420
7421static PyObject *
7422unicode_rfind(PyUnicodeObject *self, PyObject *args)
7423{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007424 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007425 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007426 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007427 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
Guido van Rossumb8872e62000-05-09 14:14:27 +00007429 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7430 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007432 substring = PyUnicode_FromObject(substring);
7433 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 return NULL;
7435
Thomas Wouters477c8d52006-05-27 19:21:47 +00007436 result = stringlib_rfind_slice(
7437 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7438 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7439 start, end
7440 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441
7442 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007443
7444 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445}
7446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007447PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448"S.rindex(sub [,start [,end]]) -> int\n\
7449\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007450Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451
7452static PyObject *
7453unicode_rindex(PyUnicodeObject *self, PyObject *args)
7454{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007455 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007456 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007457 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007458 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459
Guido van Rossumb8872e62000-05-09 14:14:27 +00007460 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7461 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007463 substring = PyUnicode_FromObject(substring);
7464 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 return NULL;
7466
Thomas Wouters477c8d52006-05-27 19:21:47 +00007467 result = stringlib_rfind_slice(
7468 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7469 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7470 start, end
7471 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472
7473 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007474
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 if (result < 0) {
7476 PyErr_SetString(PyExc_ValueError, "substring not found");
7477 return NULL;
7478 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007479 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480}
7481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007482PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007483"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484\n\
7485Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007486done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487
7488static PyObject *
7489unicode_rjust(PyUnicodeObject *self, PyObject *args)
7490{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007491 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007492 Py_UNICODE fillchar = ' ';
7493
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007494 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 return NULL;
7496
Tim Peters7a29bd52001-09-12 03:03:31 +00007497 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 Py_INCREF(self);
7499 return (PyObject*) self;
7500 }
7501
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007502 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503}
7504
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505PyObject *PyUnicode_Split(PyObject *s,
7506 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007507 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508{
7509 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007510
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 s = PyUnicode_FromObject(s);
7512 if (s == NULL)
7513 return NULL;
7514 if (sep != NULL) {
7515 sep = PyUnicode_FromObject(sep);
7516 if (sep == NULL) {
7517 Py_DECREF(s);
7518 return NULL;
7519 }
7520 }
7521
7522 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7523
7524 Py_DECREF(s);
7525 Py_XDECREF(sep);
7526 return result;
7527}
7528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007529PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530"S.split([sep [,maxsplit]]) -> list of strings\n\
7531\n\
7532Return a list of the words in S, using sep as the\n\
7533delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007534splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007535any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536
7537static PyObject*
7538unicode_split(PyUnicodeObject *self, PyObject *args)
7539{
7540 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007541 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542
Martin v. Löwis18e16552006-02-15 17:27:45 +00007543 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 return NULL;
7545
7546 if (substring == Py_None)
7547 return split(self, NULL, maxcount);
7548 else if (PyUnicode_Check(substring))
7549 return split(self, (PyUnicodeObject *)substring, maxcount);
7550 else
7551 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7552}
7553
Thomas Wouters477c8d52006-05-27 19:21:47 +00007554PyObject *
7555PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7556{
7557 PyObject* str_obj;
7558 PyObject* sep_obj;
7559 PyObject* out;
7560
7561 str_obj = PyUnicode_FromObject(str_in);
7562 if (!str_obj)
7563 return NULL;
7564 sep_obj = PyUnicode_FromObject(sep_in);
7565 if (!sep_obj) {
7566 Py_DECREF(str_obj);
7567 return NULL;
7568 }
7569
7570 out = stringlib_partition(
7571 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7572 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7573 );
7574
7575 Py_DECREF(sep_obj);
7576 Py_DECREF(str_obj);
7577
7578 return out;
7579}
7580
7581
7582PyObject *
7583PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7584{
7585 PyObject* str_obj;
7586 PyObject* sep_obj;
7587 PyObject* out;
7588
7589 str_obj = PyUnicode_FromObject(str_in);
7590 if (!str_obj)
7591 return NULL;
7592 sep_obj = PyUnicode_FromObject(sep_in);
7593 if (!sep_obj) {
7594 Py_DECREF(str_obj);
7595 return NULL;
7596 }
7597
7598 out = stringlib_rpartition(
7599 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7600 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7601 );
7602
7603 Py_DECREF(sep_obj);
7604 Py_DECREF(str_obj);
7605
7606 return out;
7607}
7608
7609PyDoc_STRVAR(partition__doc__,
7610"S.partition(sep) -> (head, sep, tail)\n\
7611\n\
7612Searches for the separator sep in S, and returns the part before it,\n\
7613the separator itself, and the part after it. If the separator is not\n\
7614found, returns S and two empty strings.");
7615
7616static PyObject*
7617unicode_partition(PyUnicodeObject *self, PyObject *separator)
7618{
7619 return PyUnicode_Partition((PyObject *)self, separator);
7620}
7621
7622PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007623"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007624\n\
7625Searches for the separator sep in S, starting at the end of S, and returns\n\
7626the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007627separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007628
7629static PyObject*
7630unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7631{
7632 return PyUnicode_RPartition((PyObject *)self, separator);
7633}
7634
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007635PyObject *PyUnicode_RSplit(PyObject *s,
7636 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007637 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007638{
7639 PyObject *result;
7640
7641 s = PyUnicode_FromObject(s);
7642 if (s == NULL)
7643 return NULL;
7644 if (sep != NULL) {
7645 sep = PyUnicode_FromObject(sep);
7646 if (sep == NULL) {
7647 Py_DECREF(s);
7648 return NULL;
7649 }
7650 }
7651
7652 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7653
7654 Py_DECREF(s);
7655 Py_XDECREF(sep);
7656 return result;
7657}
7658
7659PyDoc_STRVAR(rsplit__doc__,
7660"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7661\n\
7662Return a list of the words in S, using sep as the\n\
7663delimiter string, starting at the end of the string and\n\
7664working to the front. If maxsplit is given, at most maxsplit\n\
7665splits are done. If sep is not specified, any whitespace string\n\
7666is a separator.");
7667
7668static PyObject*
7669unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7670{
7671 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007672 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007673
Martin v. Löwis18e16552006-02-15 17:27:45 +00007674 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007675 return NULL;
7676
7677 if (substring == Py_None)
7678 return rsplit(self, NULL, maxcount);
7679 else if (PyUnicode_Check(substring))
7680 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7681 else
7682 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7683}
7684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007685PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007686"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687\n\
7688Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007689Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007690is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691
7692static PyObject*
7693unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7694{
Guido van Rossum86662912000-04-11 15:38:46 +00007695 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696
Guido van Rossum86662912000-04-11 15:38:46 +00007697 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 return NULL;
7699
Guido van Rossum86662912000-04-11 15:38:46 +00007700 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701}
7702
7703static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007704PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705{
Walter Dörwald346737f2007-05-31 10:44:43 +00007706 if (PyUnicode_CheckExact(self)) {
7707 Py_INCREF(self);
7708 return self;
7709 } else
7710 /* Subtype -- return genuine unicode string with the same value. */
7711 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7712 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713}
7714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007715PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716"S.swapcase() -> unicode\n\
7717\n\
7718Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007719and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720
7721static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007722unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 return fixup(self, fixswapcase);
7725}
7726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007727PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728"S.translate(table) -> unicode\n\
7729\n\
7730Return a copy of the string S, where all characters have been mapped\n\
7731through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007732Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7733Unmapped characters are left untouched. Characters mapped to None\n\
7734are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735
7736static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007737unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738{
Tim Petersced69f82003-09-16 20:30:58 +00007739 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007741 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 "ignore");
7743}
7744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746"S.upper() -> unicode\n\
7747\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749
7750static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007751unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 return fixup(self, fixupper);
7754}
7755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007756PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757"S.zfill(width) -> unicode\n\
7758\n\
7759Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761
7762static PyObject *
7763unicode_zfill(PyUnicodeObject *self, PyObject *args)
7764{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007765 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 PyUnicodeObject *u;
7767
Martin v. Löwis18e16552006-02-15 17:27:45 +00007768 Py_ssize_t width;
7769 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 return NULL;
7771
7772 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007773 if (PyUnicode_CheckExact(self)) {
7774 Py_INCREF(self);
7775 return (PyObject*) self;
7776 }
7777 else
7778 return PyUnicode_FromUnicode(
7779 PyUnicode_AS_UNICODE(self),
7780 PyUnicode_GET_SIZE(self)
7781 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 }
7783
7784 fill = width - self->length;
7785
7786 u = pad(self, fill, 0, '0');
7787
Walter Dörwald068325e2002-04-15 13:36:47 +00007788 if (u == NULL)
7789 return NULL;
7790
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791 if (u->str[fill] == '+' || u->str[fill] == '-') {
7792 /* move sign to beginning of string */
7793 u->str[0] = u->str[fill];
7794 u->str[fill] = '0';
7795 }
7796
7797 return (PyObject*) u;
7798}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799
#if 0
/* Compiled out.  Returns unicode_freelist_size as a Python int; the
   matching (also disabled) method-table entry describes it as a
   debugging aid for the implementation. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007808PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007809"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007811Return True if S starts with the specified prefix, False otherwise.\n\
7812With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007813With optional end, stop comparing S at that position.\n\
7814prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815
7816static PyObject *
7817unicode_startswith(PyUnicodeObject *self,
7818 PyObject *args)
7819{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007820 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007822 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007823 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007824 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007826 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007827 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007829 if (PyTuple_Check(subobj)) {
7830 Py_ssize_t i;
7831 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7832 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7833 PyTuple_GET_ITEM(subobj, i));
7834 if (substring == NULL)
7835 return NULL;
7836 result = tailmatch(self, substring, start, end, -1);
7837 Py_DECREF(substring);
7838 if (result) {
7839 Py_RETURN_TRUE;
7840 }
7841 }
7842 /* nothing matched */
7843 Py_RETURN_FALSE;
7844 }
7845 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007847 return NULL;
7848 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007850 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851}
7852
7853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007854PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007855"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007857Return True if S ends with the specified suffix, False otherwise.\n\
7858With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007859With optional end, stop comparing S at that position.\n\
7860suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861
7862static PyObject *
7863unicode_endswith(PyUnicodeObject *self,
7864 PyObject *args)
7865{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007866 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007868 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007869 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007870 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007872 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7873 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007875 if (PyTuple_Check(subobj)) {
7876 Py_ssize_t i;
7877 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7878 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7879 PyTuple_GET_ITEM(subobj, i));
7880 if (substring == NULL)
7881 return NULL;
7882 result = tailmatch(self, substring, start, end, +1);
7883 Py_DECREF(substring);
7884 if (result) {
7885 Py_RETURN_TRUE;
7886 }
7887 }
7888 Py_RETURN_FALSE;
7889 }
7890 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007894 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007896 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897}
7898
Eric Smith8c663262007-08-25 02:26:07 +00007899#include "stringlib/string_format.h"
7900
7901PyDoc_STRVAR(format__doc__,
7902"S.format(*args, **kwargs) -> unicode\n\
7903\n\
7904");
7905
7906static PyObject *
7907unicode_format(PyObject *self, PyObject *args, PyObject *kwds)
7908{
7909 /* this calls into stringlib/string_format.h because it can be
7910 included for either string or unicode. this is needed for
7911 python 2.6. */
7912 return do_string_format(self, args, kwds);
7913}
7914
7915
7916PyDoc_STRVAR(p_format__doc__,
7917"S.__format__(format_spec) -> unicode\n\
7918\n\
7919");
7920
7921static PyObject *
7922unicode__format__(PyObject *self, PyObject *args)
7923{
7924 return unicode_unicode__format__(self, args);
7925}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007927
7928static PyObject *
7929unicode_getnewargs(PyUnicodeObject *v)
7930{
7931 return Py_BuildValue("(u#)", v->str, v->length);
7932}
7933
7934
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935static PyMethodDef unicode_methods[] = {
7936
7937 /* Order is according to common usage: often used methods should
7938 appear first, since lookup is done sequentially. */
7939
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007940 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7941 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7942 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007943 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007944 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7945 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7946 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7947 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7948 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7949 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7950 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007951 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007952 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7953 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7954 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007955 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007956 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007957/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7958 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7959 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7960 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007961 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007962 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007963 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007964 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007965 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7966 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7967 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7968 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7969 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7970 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7971 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7972 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7973 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7974 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7975 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7976 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7977 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7978 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00007979 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007980 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith8c663262007-08-25 02:26:07 +00007981 {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7982 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00007983 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7984 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00007985#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007986 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987#endif
7988
7989#if 0
7990 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007991 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992#endif
7993
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007994 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 {NULL, NULL}
7996};
7997
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007998static PyObject *
7999unicode_mod(PyObject *v, PyObject *w)
8000{
8001 if (!PyUnicode_Check(v)) {
8002 Py_INCREF(Py_NotImplemented);
8003 return Py_NotImplemented;
8004 }
8005 return PyUnicode_Format(v, w);
8006}
8007
8008static PyNumberMethods unicode_as_number = {
8009 0, /*nb_add*/
8010 0, /*nb_subtract*/
8011 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008012 unicode_mod, /*nb_remainder*/
8013};
8014
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008016 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008017 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008018 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8019 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008020 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 0, /* sq_ass_item */
8022 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008023 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024};
8025
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008026static PyObject*
8027unicode_subscript(PyUnicodeObject* self, PyObject* item)
8028{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008029 if (PyIndex_Check(item)) {
8030 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008031 if (i == -1 && PyErr_Occurred())
8032 return NULL;
8033 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008034 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008035 return unicode_getitem(self, i);
8036 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008037 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008038 Py_UNICODE* source_buf;
8039 Py_UNICODE* result_buf;
8040 PyObject* result;
8041
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008042 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008043 &start, &stop, &step, &slicelength) < 0) {
8044 return NULL;
8045 }
8046
8047 if (slicelength <= 0) {
8048 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008049 } else if (start == 0 && step == 1 && slicelength == self->length &&
8050 PyUnicode_CheckExact(self)) {
8051 Py_INCREF(self);
8052 return (PyObject *)self;
8053 } else if (step == 1) {
8054 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008055 } else {
8056 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008057 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8058 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008059
8060 if (result_buf == NULL)
8061 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008062
8063 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8064 result_buf[i] = source_buf[cur];
8065 }
Tim Petersced69f82003-09-16 20:30:58 +00008066
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008067 result = PyUnicode_FromUnicode(result_buf, slicelength);
8068 PyMem_FREE(result_buf);
8069 return result;
8070 }
8071 } else {
8072 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8073 return NULL;
8074 }
8075}
8076
8077static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008078 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008079 (binaryfunc)unicode_subscript, /* mp_subscript */
8080 (objobjargproc)0, /* mp_ass_subscript */
8081};
8082
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083
8084static int
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008085unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008088 if (flags & PyBUF_CHARACTER) {
Guido van Rossuma74184e2007-08-29 04:05:57 +00008089 PyErr_SetString(PyExc_SystemError, "can't use str as char buffer");
8090 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 }
Guido van Rossuma74184e2007-08-29 04:05:57 +00008092 return PyBuffer_FillInfo(view, (void *)self->str,
8093 PyUnicode_GET_DATA_SIZE(self), 1, flags);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094}
8095
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008096
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097/* Helpers for PyUnicode_Format() */
8098
8099static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008102 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 if (argidx < arglen) {
8104 (*p_argidx)++;
8105 if (arglen < 0)
8106 return args;
8107 else
8108 return PyTuple_GetItem(args, argidx);
8109 }
8110 PyErr_SetString(PyExc_TypeError,
8111 "not enough arguments for format string");
8112 return NULL;
8113}
8114
/* Bit flags collected while parsing a %-format specifier.  F_ALT selects
   the '#' (alternate) form in formatfloat()/formatint(); the remaining
   flags are consumed by code outside this excerpt. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
8120
Martin v. Löwis18e16552006-02-15 17:27:45 +00008121static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008122strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008124 register Py_ssize_t i;
8125 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 for (i = len - 1; i >= 0; i--)
8127 buffer[i] = (Py_UNICODE) charbuffer[i];
8128
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 return len;
8130}
8131
Neal Norwitzfc76d632006-01-10 06:03:13 +00008132static int
8133doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8134{
Tim Peters15231542006-02-16 01:08:01 +00008135 Py_ssize_t result;
8136
Neal Norwitzfc76d632006-01-10 06:03:13 +00008137 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008138 result = strtounicode(buffer, (char *)buffer);
8139 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008140}
8141
8142static int
8143longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8144{
Tim Peters15231542006-02-16 01:08:01 +00008145 Py_ssize_t result;
8146
Neal Norwitzfc76d632006-01-10 06:03:13 +00008147 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008148 result = strtounicode(buffer, (char *)buffer);
8149 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008150}
8151
Guido van Rossum078151d2002-08-11 04:24:12 +00008152/* XXX To save some code duplication, formatfloat/long/int could have been
8153 shared with stringobject.c, converting from 8-bit to Unicode after the
8154 formatting is done. */
8155
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156static int
8157formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008158 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 int flags,
8160 int prec,
8161 int type,
8162 PyObject *v)
8163{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008164 /* fmt = '%#.' + `prec` + `type`
8165 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 char fmt[20];
8167 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008168
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 x = PyFloat_AsDouble(v);
8170 if (x == -1.0 && PyErr_Occurred())
8171 return -1;
8172 if (prec < 0)
8173 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8175 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008176 /* Worst case length calc to ensure no buffer overrun:
8177
8178 'g' formats:
8179 fmt = %#.<prec>g
8180 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8181 for any double rep.)
8182 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8183
8184 'f' formats:
8185 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8186 len = 1 + 50 + 1 + prec = 52 + prec
8187
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008188 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008189 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008190
8191 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008192 if (((type == 'g' || type == 'G') &&
8193 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008194 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008195 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008196 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008197 return -1;
8198 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008199 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8200 (flags&F_ALT) ? "#" : "",
8201 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008202 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203}
8204
Tim Peters38fd5b62000-09-21 05:43:11 +00008205static PyObject*
8206formatlong(PyObject *val, int flags, int prec, int type)
8207{
8208 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008209 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008210 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008211 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008212
8213 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8214 if (!str)
8215 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008216 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008217 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008218 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008219}
8220
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221static int
8222formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008223 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 int flags,
8225 int prec,
8226 int type,
8227 PyObject *v)
8228{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008229 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008230 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8231 * + 1 + 1
8232 * = 24
8233 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008234 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008235 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 long x;
8237
8238 x = PyInt_AsLong(v);
8239 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008240 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008241 if (x < 0 && type == 'u') {
8242 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008243 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008244 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8245 sign = "-";
8246 else
8247 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008249 prec = 1;
8250
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008251 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8252 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008253 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008254 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008255 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008256 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008257 return -1;
8258 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008259
8260 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008261 (type == 'x' || type == 'X' || type == 'o')) {
8262 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008263 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008264 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008265 * - when 0 is being converted, the C standard leaves off
8266 * the '0x' or '0X', which is inconsistent with other
8267 * %#x/%#X conversions and inconsistent with Python's
8268 * hex() function
8269 * - there are platforms that violate the standard and
8270 * convert 0 with the '0x' or '0X'
8271 * (Metrowerks, Compaq Tru64)
8272 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008273 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008274 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008275 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008276 * We can achieve the desired consistency by inserting our
8277 * own '0x' or '0X' prefix, and substituting %x/%X in place
8278 * of %#x/%#X.
8279 *
8280 * Note that this is the same approach as used in
8281 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008282 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008283 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8284 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008285 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008286 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008287 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8288 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008289 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008290 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008291 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008292 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008293 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008294 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295}
8296
8297static int
8298formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008299 size_t buflen,
8300 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008302 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008303 if (PyUnicode_Check(v)) {
8304 if (PyUnicode_GET_SIZE(v) != 1)
8305 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008309 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008310 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008311 goto onError;
8312 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314
8315 else {
8316 /* Integer input truncated to a character */
8317 long x;
8318 x = PyInt_AsLong(v);
8319 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008320 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008321#ifdef Py_UNICODE_WIDE
8322 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008323 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008324 "%c arg not in range(0x110000) "
8325 "(wide Python build)");
8326 return -1;
8327 }
8328#else
8329 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008330 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008331 "%c arg not in range(0x10000) "
8332 "(narrow Python build)");
8333 return -1;
8334 }
8335#endif
8336 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 }
8338 buf[1] = '\0';
8339 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008340
8341 onError:
8342 PyErr_SetString(PyExc_TypeError,
8343 "%c requires int or char");
8344 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345}
8346
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8356
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357PyObject *PyUnicode_Format(PyObject *format,
8358 PyObject *args)
8359{
8360 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008361 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 int args_owned = 0;
8363 PyUnicodeObject *result = NULL;
8364 PyObject *dict = NULL;
8365 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008366
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 if (format == NULL || args == NULL) {
8368 PyErr_BadInternalCall();
8369 return NULL;
8370 }
8371 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008372 if (uformat == NULL)
8373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 fmt = PyUnicode_AS_UNICODE(uformat);
8375 fmtcnt = PyUnicode_GET_SIZE(uformat);
8376
8377 reslen = rescnt = fmtcnt + 100;
8378 result = _PyUnicode_New(reslen);
8379 if (result == NULL)
8380 goto onError;
8381 res = PyUnicode_AS_UNICODE(result);
8382
8383 if (PyTuple_Check(args)) {
8384 arglen = PyTuple_Size(args);
8385 argidx = 0;
8386 }
8387 else {
8388 arglen = -1;
8389 argidx = -2;
8390 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008391 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008392 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 dict = args;
8394
8395 while (--fmtcnt >= 0) {
8396 if (*fmt != '%') {
8397 if (--rescnt < 0) {
8398 rescnt = fmtcnt + 100;
8399 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008400 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008401 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8403 --rescnt;
8404 }
8405 *res++ = *fmt++;
8406 }
8407 else {
8408 /* Got a format specifier */
8409 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008410 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 Py_UNICODE c = '\0';
8413 Py_UNICODE fill;
8414 PyObject *v = NULL;
8415 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008416 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008418 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008419 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420
8421 fmt++;
8422 if (*fmt == '(') {
8423 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008424 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 PyObject *key;
8426 int pcount = 1;
8427
8428 if (dict == NULL) {
8429 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008430 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 goto onError;
8432 }
8433 ++fmt;
8434 --fmtcnt;
8435 keystart = fmt;
8436 /* Skip over balanced parentheses */
8437 while (pcount > 0 && --fmtcnt >= 0) {
8438 if (*fmt == ')')
8439 --pcount;
8440 else if (*fmt == '(')
8441 ++pcount;
8442 fmt++;
8443 }
8444 keylen = fmt - keystart - 1;
8445 if (fmtcnt < 0 || pcount > 0) {
8446 PyErr_SetString(PyExc_ValueError,
8447 "incomplete format key");
8448 goto onError;
8449 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008450#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008451 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 then looked up since Python uses strings to hold
8453 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008454 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 key = PyUnicode_EncodeUTF8(keystart,
8456 keylen,
8457 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008458#else
8459 key = PyUnicode_FromUnicode(keystart, keylen);
8460#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 if (key == NULL)
8462 goto onError;
8463 if (args_owned) {
8464 Py_DECREF(args);
8465 args_owned = 0;
8466 }
8467 args = PyObject_GetItem(dict, key);
8468 Py_DECREF(key);
8469 if (args == NULL) {
8470 goto onError;
8471 }
8472 args_owned = 1;
8473 arglen = -1;
8474 argidx = -2;
8475 }
8476 while (--fmtcnt >= 0) {
8477 switch (c = *fmt++) {
8478 case '-': flags |= F_LJUST; continue;
8479 case '+': flags |= F_SIGN; continue;
8480 case ' ': flags |= F_BLANK; continue;
8481 case '#': flags |= F_ALT; continue;
8482 case '0': flags |= F_ZERO; continue;
8483 }
8484 break;
8485 }
8486 if (c == '*') {
8487 v = getnextarg(args, arglen, &argidx);
8488 if (v == NULL)
8489 goto onError;
8490 if (!PyInt_Check(v)) {
8491 PyErr_SetString(PyExc_TypeError,
8492 "* wants int");
8493 goto onError;
8494 }
8495 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008496 if (width == -1 && PyErr_Occurred())
8497 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 if (width < 0) {
8499 flags |= F_LJUST;
8500 width = -width;
8501 }
8502 if (--fmtcnt >= 0)
8503 c = *fmt++;
8504 }
8505 else if (c >= '0' && c <= '9') {
8506 width = c - '0';
8507 while (--fmtcnt >= 0) {
8508 c = *fmt++;
8509 if (c < '0' || c > '9')
8510 break;
8511 if ((width*10) / 10 != width) {
8512 PyErr_SetString(PyExc_ValueError,
8513 "width too big");
8514 goto onError;
8515 }
8516 width = width*10 + (c - '0');
8517 }
8518 }
8519 if (c == '.') {
8520 prec = 0;
8521 if (--fmtcnt >= 0)
8522 c = *fmt++;
8523 if (c == '*') {
8524 v = getnextarg(args, arglen, &argidx);
8525 if (v == NULL)
8526 goto onError;
8527 if (!PyInt_Check(v)) {
8528 PyErr_SetString(PyExc_TypeError,
8529 "* wants int");
8530 goto onError;
8531 }
8532 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008533 if (prec == -1 && PyErr_Occurred())
8534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 if (prec < 0)
8536 prec = 0;
8537 if (--fmtcnt >= 0)
8538 c = *fmt++;
8539 }
8540 else if (c >= '0' && c <= '9') {
8541 prec = c - '0';
8542 while (--fmtcnt >= 0) {
8543 c = Py_CHARMASK(*fmt++);
8544 if (c < '0' || c > '9')
8545 break;
8546 if ((prec*10) / 10 != prec) {
8547 PyErr_SetString(PyExc_ValueError,
8548 "prec too big");
8549 goto onError;
8550 }
8551 prec = prec*10 + (c - '0');
8552 }
8553 }
8554 } /* prec */
8555 if (fmtcnt >= 0) {
8556 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 if (--fmtcnt >= 0)
8558 c = *fmt++;
8559 }
8560 }
8561 if (fmtcnt < 0) {
8562 PyErr_SetString(PyExc_ValueError,
8563 "incomplete format");
8564 goto onError;
8565 }
8566 if (c != '%') {
8567 v = getnextarg(args, arglen, &argidx);
8568 if (v == NULL)
8569 goto onError;
8570 }
8571 sign = 0;
8572 fill = ' ';
8573 switch (c) {
8574
8575 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008576 pbuf = formatbuf;
8577 /* presume that buffer length is at least 1 */
8578 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 len = 1;
8580 break;
8581
8582 case 's':
8583 case 'r':
8584 if (PyUnicode_Check(v) && c == 's') {
8585 temp = v;
8586 Py_INCREF(temp);
8587 }
8588 else {
8589 PyObject *unicode;
8590 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008591 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 else
8593 temp = PyObject_Repr(v);
8594 if (temp == NULL)
8595 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008596 if (PyUnicode_Check(temp))
8597 /* nothing to do */;
8598 else if (PyString_Check(temp)) {
8599 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008600 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008602 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008604 Py_DECREF(temp);
8605 temp = unicode;
8606 if (temp == NULL)
8607 goto onError;
8608 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008609 else {
8610 Py_DECREF(temp);
8611 PyErr_SetString(PyExc_TypeError,
8612 "%s argument has non-string str()");
8613 goto onError;
8614 }
8615 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008616 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 len = PyUnicode_GET_SIZE(temp);
8618 if (prec >= 0 && len > prec)
8619 len = prec;
8620 break;
8621
8622 case 'i':
8623 case 'd':
8624 case 'u':
8625 case 'o':
8626 case 'x':
8627 case 'X':
8628 if (c == 'i')
8629 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008630 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008631 temp = formatlong(v, flags, prec, c);
8632 if (!temp)
8633 goto onError;
8634 pbuf = PyUnicode_AS_UNICODE(temp);
8635 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008636 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008638 else {
8639 pbuf = formatbuf;
8640 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8641 flags, prec, c, v);
8642 if (len < 0)
8643 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008644 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008645 }
8646 if (flags & F_ZERO)
8647 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 break;
8649
8650 case 'e':
8651 case 'E':
8652 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008653 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 case 'g':
8655 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008656 if (c == 'F')
8657 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008658 pbuf = formatbuf;
8659 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8660 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 if (len < 0)
8662 goto onError;
8663 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008664 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 fill = '0';
8666 break;
8667
8668 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008669 pbuf = formatbuf;
8670 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 if (len < 0)
8672 goto onError;
8673 break;
8674
8675 default:
8676 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008677 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008678 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008679 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008680 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008681 (Py_ssize_t)(fmt - 1 -
8682 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 goto onError;
8684 }
8685 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008686 if (*pbuf == '-' || *pbuf == '+') {
8687 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 len--;
8689 }
8690 else if (flags & F_SIGN)
8691 sign = '+';
8692 else if (flags & F_BLANK)
8693 sign = ' ';
8694 else
8695 sign = 0;
8696 }
8697 if (width < len)
8698 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008699 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 reslen -= rescnt;
8701 rescnt = width + fmtcnt + 100;
8702 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008703 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008704 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008705 PyErr_NoMemory();
8706 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008707 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008708 if (_PyUnicode_Resize(&result, reslen) < 0) {
8709 Py_XDECREF(temp);
8710 goto onError;
8711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 res = PyUnicode_AS_UNICODE(result)
8713 + reslen - rescnt;
8714 }
8715 if (sign) {
8716 if (fill != ' ')
8717 *res++ = sign;
8718 rescnt--;
8719 if (width > len)
8720 width--;
8721 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008722 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008723 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008724 assert(pbuf[1] == c);
8725 if (fill != ' ') {
8726 *res++ = *pbuf++;
8727 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008728 }
Tim Petersfff53252001-04-12 18:38:48 +00008729 rescnt -= 2;
8730 width -= 2;
8731 if (width < 0)
8732 width = 0;
8733 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 if (width > len && !(flags & F_LJUST)) {
8736 do {
8737 --rescnt;
8738 *res++ = fill;
8739 } while (--width > len);
8740 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008741 if (fill == ' ') {
8742 if (sign)
8743 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008744 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008745 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008746 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008747 *res++ = *pbuf++;
8748 *res++ = *pbuf++;
8749 }
8750 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008751 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 res += len;
8753 rescnt -= len;
8754 while (--width >= len) {
8755 --rescnt;
8756 *res++ = ' ';
8757 }
8758 if (dict && (argidx < arglen) && c != '%') {
8759 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008760 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008761 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762 goto onError;
8763 }
8764 Py_XDECREF(temp);
8765 } /* '%' */
8766 } /* until end */
8767 if (argidx < arglen && !dict) {
8768 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008769 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 goto onError;
8771 }
8772
Thomas Woutersa96affe2006-03-12 00:29:36 +00008773 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8774 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 if (args_owned) {
8776 Py_DECREF(args);
8777 }
8778 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 return (PyObject *)result;
8780
8781 onError:
8782 Py_XDECREF(result);
8783 Py_DECREF(uformat);
8784 if (args_owned) {
8785 Py_DECREF(args);
8786 }
8787 return NULL;
8788}
8789
8790static PyBufferProcs unicode_as_buffer = {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00008791 (getbufferproc) unicode_buffer_getbuffer,
8792 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793};
8794
Jeremy Hylton938ace62002-07-17 16:30:39 +00008795static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008796unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8797
Tim Peters6d6c1a32001-08-02 04:15:00 +00008798static PyObject *
8799unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8800{
8801 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008802 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008803 char *encoding = NULL;
8804 char *errors = NULL;
8805
Guido van Rossume023fe02001-08-30 03:12:59 +00008806 if (type != &PyUnicode_Type)
8807 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008808 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8809 kwlist, &x, &encoding, &errors))
8810 return NULL;
8811 if (x == NULL)
8812 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008813 if (encoding == NULL && errors == NULL)
8814 return PyObject_Unicode(x);
8815 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008816 return PyUnicode_FromEncodedObject(x, encoding, errors);
8817}
8818
Guido van Rossume023fe02001-08-30 03:12:59 +00008819static PyObject *
8820unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8821{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008822 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008823 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008824
8825 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8826 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8827 if (tmp == NULL)
8828 return NULL;
8829 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008830 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008831 if (pnew == NULL) {
8832 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008833 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008834 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008835 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8836 if (pnew->str == NULL) {
8837 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008838 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008839 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008840 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008841 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008842 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8843 pnew->length = n;
8844 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008845 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008846 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008847}
8848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008849PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008850"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008851\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008852Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008853encoding defaults to the current default string encoding.\n\
8854errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008855
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008856static PyObject *unicode_iter(PyObject *seq);
8857
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008859 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008860 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 sizeof(PyUnicodeObject), /* tp_size */
8862 0, /* tp_itemsize */
8863 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008864 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008866 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008868 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008869 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008870 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008872 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 (hashfunc) unicode_hash, /* tp_hash*/
8874 0, /* tp_call*/
8875 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008876 PyObject_GenericGetAttr, /* tp_getattro */
8877 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008879 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8880 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008881 unicode_doc, /* tp_doc */
8882 0, /* tp_traverse */
8883 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008884 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008885 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008886 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008887 0, /* tp_iternext */
8888 unicode_methods, /* tp_methods */
8889 0, /* tp_members */
8890 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008891 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008892 0, /* tp_dict */
8893 0, /* tp_descr_get */
8894 0, /* tp_descr_set */
8895 0, /* tp_dictoffset */
8896 0, /* tp_init */
8897 0, /* tp_alloc */
8898 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008899 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900};
8901
8902/* Initialize the Unicode implementation */
8903
Thomas Wouters78890102000-07-22 19:25:51 +00008904void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008906 int i;
8907
Thomas Wouters477c8d52006-05-27 19:21:47 +00008908 /* XXX - move this array to unicodectype.c ? */
8909 Py_UNICODE linebreak[] = {
8910 0x000A, /* LINE FEED */
8911 0x000D, /* CARRIAGE RETURN */
8912 0x001C, /* FILE SEPARATOR */
8913 0x001D, /* GROUP SEPARATOR */
8914 0x001E, /* RECORD SEPARATOR */
8915 0x0085, /* NEXT LINE */
8916 0x2028, /* LINE SEPARATOR */
8917 0x2029, /* PARAGRAPH SEPARATOR */
8918 };
8919
Fred Drakee4315f52000-05-09 19:53:39 +00008920 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008921 unicode_freelist = NULL;
8922 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008924 if (!unicode_empty)
8925 return;
8926
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008927 for (i = 0; i < 256; i++)
8928 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008929 if (PyType_Ready(&PyUnicode_Type) < 0)
8930 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008931
8932 /* initialize the linebreak bloom filter */
8933 bloom_linebreak = make_bloom_mask(
8934 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8935 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008936
8937 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938}
8939
8940/* Finalize the Unicode implementation */
8941
8942void
Thomas Wouters78890102000-07-22 19:25:51 +00008943_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008945 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008946 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008948 Py_XDECREF(unicode_empty);
8949 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008950
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008951 for (i = 0; i < 256; i++) {
8952 if (unicode_latin1[i]) {
8953 Py_DECREF(unicode_latin1[i]);
8954 unicode_latin1[i] = NULL;
8955 }
8956 }
8957
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008958 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 PyUnicodeObject *v = u;
8960 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008961 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008962 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008963 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008964 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008966 unicode_freelist = NULL;
8967 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008969
Walter Dörwald16807132007-05-25 13:52:07 +00008970void
8971PyUnicode_InternInPlace(PyObject **p)
8972{
8973 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8974 PyObject *t;
8975 if (s == NULL || !PyUnicode_Check(s))
8976 Py_FatalError(
8977 "PyUnicode_InternInPlace: unicode strings only please!");
8978 /* If it's a subclass, we don't really know what putting
8979 it in the interned dict might do. */
8980 if (!PyUnicode_CheckExact(s))
8981 return;
8982 if (PyUnicode_CHECK_INTERNED(s))
8983 return;
8984 if (interned == NULL) {
8985 interned = PyDict_New();
8986 if (interned == NULL) {
8987 PyErr_Clear(); /* Don't leave an exception */
8988 return;
8989 }
8990 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008991 /* It might be that the GetItem call fails even
8992 though the key is present in the dictionary,
8993 namely when this happens during a stack overflow. */
8994 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008995 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008996 Py_END_ALLOW_RECURSION
8997
Walter Dörwald16807132007-05-25 13:52:07 +00008998 if (t) {
8999 Py_INCREF(t);
9000 Py_DECREF(*p);
9001 *p = t;
9002 return;
9003 }
9004
Martin v. Löwis5b222132007-06-10 09:51:05 +00009005 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009006 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9007 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009008 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009009 return;
9010 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009011 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009012 /* The two references in interned are not counted by refcnt.
9013 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009014 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009015 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9016}
9017
9018void
9019PyUnicode_InternImmortal(PyObject **p)
9020{
9021 PyUnicode_InternInPlace(p);
9022 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9023 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9024 Py_INCREF(*p);
9025 }
9026}
9027
9028PyObject *
9029PyUnicode_InternFromString(const char *cp)
9030{
9031 PyObject *s = PyUnicode_FromString(cp);
9032 if (s == NULL)
9033 return NULL;
9034 PyUnicode_InternInPlace(&s);
9035 return s;
9036}
9037
9038void _Py_ReleaseInternedUnicodeStrings(void)
9039{
9040 PyObject *keys;
9041 PyUnicodeObject *s;
9042 Py_ssize_t i, n;
9043 Py_ssize_t immortal_size = 0, mortal_size = 0;
9044
9045 if (interned == NULL || !PyDict_Check(interned))
9046 return;
9047 keys = PyDict_Keys(interned);
9048 if (keys == NULL || !PyList_Check(keys)) {
9049 PyErr_Clear();
9050 return;
9051 }
9052
9053 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9054 detector, interned unicode strings are not forcibly deallocated;
9055 rather, we give them their stolen references back, and then clear
9056 and DECREF the interned dict. */
9057
9058 n = PyList_GET_SIZE(keys);
9059 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9060 n);
9061 for (i = 0; i < n; i++) {
9062 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9063 switch (s->state) {
9064 case SSTATE_NOT_INTERNED:
9065 /* XXX Shouldn't happen */
9066 break;
9067 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009068 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009069 immortal_size += s->length;
9070 break;
9071 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009072 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009073 mortal_size += s->length;
9074 break;
9075 default:
9076 Py_FatalError("Inconsistent interned string state.");
9077 }
9078 s->state = SSTATE_NOT_INTERNED;
9079 }
9080 fprintf(stderr, "total size of all interned strings: "
9081 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9082 "mortal/immortal\n", mortal_size, immortal_size);
9083 Py_DECREF(keys);
9084 PyDict_Clear(interned);
9085 Py_DECREF(interned);
9086 interned = NULL;
9087}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009088
9089
9090/********************* Unicode Iterator **************************/
9091
9092typedef struct {
9093 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009094 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009095 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9096} unicodeiterobject;
9097
9098static void
9099unicodeiter_dealloc(unicodeiterobject *it)
9100{
9101 _PyObject_GC_UNTRACK(it);
9102 Py_XDECREF(it->it_seq);
9103 PyObject_GC_Del(it);
9104}
9105
9106static int
9107unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9108{
9109 Py_VISIT(it->it_seq);
9110 return 0;
9111}
9112
9113static PyObject *
9114unicodeiter_next(unicodeiterobject *it)
9115{
9116 PyUnicodeObject *seq;
9117 PyObject *item;
9118
9119 assert(it != NULL);
9120 seq = it->it_seq;
9121 if (seq == NULL)
9122 return NULL;
9123 assert(PyUnicode_Check(seq));
9124
9125 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009126 item = PyUnicode_FromUnicode(
9127 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009128 if (item != NULL)
9129 ++it->it_index;
9130 return item;
9131 }
9132
9133 Py_DECREF(seq);
9134 it->it_seq = NULL;
9135 return NULL;
9136}
9137
9138static PyObject *
9139unicodeiter_len(unicodeiterobject *it)
9140{
9141 Py_ssize_t len = 0;
9142 if (it->it_seq)
9143 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9144 return PyInt_FromSsize_t(len);
9145}
9146
9147PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9148
9149static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009150 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9151 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009152 {NULL, NULL} /* sentinel */
9153};
9154
9155PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009156 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009157 "unicodeiterator", /* tp_name */
9158 sizeof(unicodeiterobject), /* tp_basicsize */
9159 0, /* tp_itemsize */
9160 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009161 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009162 0, /* tp_print */
9163 0, /* tp_getattr */
9164 0, /* tp_setattr */
9165 0, /* tp_compare */
9166 0, /* tp_repr */
9167 0, /* tp_as_number */
9168 0, /* tp_as_sequence */
9169 0, /* tp_as_mapping */
9170 0, /* tp_hash */
9171 0, /* tp_call */
9172 0, /* tp_str */
9173 PyObject_GenericGetAttr, /* tp_getattro */
9174 0, /* tp_setattro */
9175 0, /* tp_as_buffer */
9176 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9177 0, /* tp_doc */
9178 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9179 0, /* tp_clear */
9180 0, /* tp_richcompare */
9181 0, /* tp_weaklistoffset */
9182 PyObject_SelfIter, /* tp_iter */
9183 (iternextfunc)unicodeiter_next, /* tp_iternext */
9184 unicodeiter_methods, /* tp_methods */
9185 0,
9186};
9187
9188static PyObject *
9189unicode_iter(PyObject *seq)
9190{
9191 unicodeiterobject *it;
9192
9193 if (!PyUnicode_Check(seq)) {
9194 PyErr_BadInternalCall();
9195 return NULL;
9196 }
9197 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9198 if (it == NULL)
9199 return NULL;
9200 it->it_index = 0;
9201 Py_INCREF(seq);
9202 it->it_seq = (PyUnicodeObject *)seq;
9203 _PyObject_GC_TRACK(it);
9204 return (PyObject *)it;
9205}
9206
Martin v. Löwis5b222132007-06-10 09:51:05 +00009207size_t
9208Py_UNICODE_strlen(const Py_UNICODE *u)
9209{
9210 int res = 0;
9211 while(*u++)
9212 res++;
9213 return res;
9214}
9215
9216Py_UNICODE*
9217Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9218{
9219 Py_UNICODE *u = s1;
9220 while ((*u++ = *s2++));
9221 return s1;
9222}
9223
9224Py_UNICODE*
9225Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9226{
9227 Py_UNICODE *u = s1;
9228 while ((*u++ = *s2++))
9229 if (n-- == 0)
9230 break;
9231 return s1;
9232}
9233
9234int
9235Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9236{
9237 while (*s1 && *s2 && *s1 == *s2)
9238 s1++, s2++;
9239 if (*s1 && *s2)
9240 return (*s1 < *s2) ? -1 : +1;
9241 if (*s1)
9242 return 1;
9243 if (*s2)
9244 return -1;
9245 return 0;
9246}
9247
9248Py_UNICODE*
9249Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9250{
9251 const Py_UNICODE *p;
9252 for (p = s; *p; p++)
9253 if (*p == c)
9254 return (Py_UNICODE*)p;
9255 return NULL;
9256}
9257
9258
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009259#ifdef __cplusplus
9260}
9261#endif
9262
9263
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009264/*
9265Local variables:
9266c-basic-offset: 4
9267indent-tabs-mode: nil
9268End:
9269*/