blob: 23268f9d130185ae74313ee522e6e2b99eb3f9ca [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
57#define MAX_UNICODE_FREELIST_SIZE 1024
58
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Barry Warsaw51ac5802000-03-20 16:36:48 +000065 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000109static PyUnicodeObject *unicode_freelist;
110static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000129PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000130{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000131#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000132 return 0x10FFFF;
133#else
134 /* This is actually an illegal character, so it should
135 not be passed to unichr. */
136 return 0xFFFF;
137#endif
138}
139
Thomas Wouters477c8d52006-05-27 19:21:47 +0000140/* --- Bloom Filters ----------------------------------------------------- */
141
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, taking the lowest 5
   bits of each Unicode character as the bit index. */
145
146/* the linebreak mask is set up by Unicode_Init below */
147
/* Integer type used to hold a bloom bitmask (only the low 32 bits are
   ever set, because the bit index is masked with 0x1F). */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK(). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shift is done on an unsigned long so that bit 31 is well defined
   (1 << 31 on a signed int is undefined behaviour), and mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-filter in front of the full line-break test: the bloom
   lookup rejects most characters before Py_UNICODE_ISLINEBREAK runs. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
156
157Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
158{
159 /* calculate simple bloom-style bitmask for a given unicode string */
160
161 long mask;
162 Py_ssize_t i;
163
164 mask = 0;
165 for (i = 0; i < len; i++)
166 mask |= (1 << (ptr[i] & 0x1F));
167
168 return mask;
169}
170
171Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
172{
173 Py_ssize_t i;
174
175 for (i = 0; i < setlen; i++)
176 if (set[i] == chr)
177 return 1;
178
179 return 0;
180}
181
/* Non-zero if chr is a member of set[0..setlen-1].  The bloom mask is
   checked first so that most non-members are rejected without scanning
   the set.  The whole expansion is parenthesized so the macro can be
   negated or combined with other operators without changing meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
184
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185/* --- Unicode Object ----------------------------------------------------- */
186
187static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000189 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190{
191 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000192
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000193 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 /* Resizing shared object (unicode_empty or single character
198 objects) in-place is not allowed. Use PyUnicode_Resize()
199 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 if (unicode == unicode_empty ||
202 (unicode->length == 1 &&
203 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 return -1;
208 }
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210 /* We allocate one more byte to make sure the string is Ux0000 terminated.
211 The overallocation is also used by fastsearch, which assumes that it's
212 safe to look at str[length] (without making any assumptions about what
213 it contains). */
214
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 oldstr = unicode->str;
216 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
217 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 PyErr_NoMemory();
220 return -1;
221 }
222 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000223 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000225 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000227 if (unicode->defenc) {
228 Py_DECREF(unicode->defenc);
229 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 }
231 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 return 0;
234}
235
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
243*/
244
245static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 register PyUnicodeObject *unicode;
249
Thomas Wouters477c8d52006-05-27 19:21:47 +0000250 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (length == 0 && unicode_empty != NULL) {
252 Py_INCREF(unicode_empty);
253 return unicode_empty;
254 }
255
256 /* Unicode freelist & memory allocation */
257 if (unicode_freelist) {
258 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000259 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Keep-Alive optimization: we only upsize the buffer,
263 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000264 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000265 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000266 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 }
269 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000270 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000272 }
273 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000276 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (unicode == NULL)
278 return NULL;
279 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
280 }
281
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000282 if (!unicode->str) {
283 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000284 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000285 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000287 * the caller fails before initializing str -- unicode_resize()
288 * reads str[0], and the Keep-Alive optimization can keep memory
289 * allocated for str alive across a call to unicode_dealloc(unicode).
290 * We don't want unicode_resize to read uninitialized memory in
291 * that case.
292 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000293 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000297 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000298 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000300
301 onError:
302 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000303 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305}
306
307static
Guido van Rossum9475a232001-10-05 20:51:39 +0000308void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
Walter Dörwald16807132007-05-25 13:52:07 +0000310 switch (PyUnicode_CHECK_INTERNED(unicode)) {
311 case SSTATE_NOT_INTERNED:
312 break;
313
314 case SSTATE_INTERNED_MORTAL:
315 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000316 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000317 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
318 Py_FatalError(
319 "deletion of interned unicode string failed");
320 break;
321
322 case SSTATE_INTERNED_IMMORTAL:
323 Py_FatalError("Immortal interned unicode string died.");
324
325 default:
326 Py_FatalError("Inconsistent interned unicode string state.");
327 }
328
Guido van Rossum604ddf82001-12-06 20:03:56 +0000329 if (PyUnicode_CheckExact(unicode) &&
330 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000331 /* Keep-Alive optimization */
332 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000333 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 unicode->str = NULL;
335 unicode->length = 0;
336 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000337 if (unicode->defenc) {
338 Py_DECREF(unicode->defenc);
339 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000340 }
341 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 *(PyUnicodeObject **)unicode = unicode_freelist;
343 unicode_freelist = unicode;
344 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000347 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000349 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 }
351}
352
Martin v. Löwis18e16552006-02-15 17:27:45 +0000353int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000354{
355 register PyUnicodeObject *v;
356
357 /* Argument checks */
358 if (unicode == NULL) {
359 PyErr_BadInternalCall();
360 return -1;
361 }
362 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000363 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 PyErr_BadInternalCall();
365 return -1;
366 }
367
368 /* Resizing unicode_empty and single character objects is not
369 possible since these are being shared. We simply return a fresh
370 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000371 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 (v == unicode_empty || v->length == 1)) {
373 PyUnicodeObject *w = _PyUnicode_New(length);
374 if (w == NULL)
375 return -1;
376 Py_UNICODE_COPY(w->str, v->str,
377 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000378 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 *unicode = (PyObject *)w;
380 return 0;
381 }
382
383 /* Note that we don't have to modify *unicode for unshared Unicode
384 objects, since we can modify them in-place. */
385 return unicode_resize(v, length);
386}
387
388/* Internal API for use in unicodeobject.c only ! */
389#define _PyUnicode_Resize(unicodevar, length) \
390 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000393 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 PyUnicodeObject *unicode;
396
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000397 /* If the Unicode data is known at construction time, we can apply
398 some optimizations which share commonly used objects. */
399 if (u != NULL) {
400
401 /* Optimization for empty strings */
402 if (size == 0 && unicode_empty != NULL) {
403 Py_INCREF(unicode_empty);
404 return (PyObject *)unicode_empty;
405 }
406
407 /* Single character Unicode objects in the Latin-1 range are
408 shared when using this constructor */
409 if (size == 1 && *u < 256) {
410 unicode = unicode_latin1[*u];
411 if (!unicode) {
412 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413 if (!unicode)
414 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000415 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 unicode_latin1[*u] = unicode;
417 }
418 Py_INCREF(unicode);
419 return (PyObject *)unicode;
420 }
421 }
Tim Petersced69f82003-09-16 20:30:58 +0000422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 unicode = _PyUnicode_New(size);
424 if (!unicode)
425 return NULL;
426
427 /* Copy the Unicode data into the new object */
428 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430
431 return (PyObject *)unicode;
432}
433
Walter Dörwaldd2034312007-05-18 16:29:38 +0000434PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000435{
436 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000437 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000438 some optimizations which share commonly used objects.
439 Also, this means the input must be UTF-8, so fall back to the
440 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000441 if (u != NULL) {
442
443 /* Optimization for empty strings */
444 if (size == 0 && unicode_empty != NULL) {
445 Py_INCREF(unicode_empty);
446 return (PyObject *)unicode_empty;
447 }
448
Martin v. Löwis9c121062007-08-05 20:26:11 +0000449 /* Single characters are shared when using this constructor.
450 Restrict to ASCII, since the input must be UTF-8. */
451 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000453 if (!unicode) {
454 unicode = _PyUnicode_New(1);
455 if (!unicode)
456 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000457 unicode->str[0] = Py_CHARMASK(*u);
458 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000459 }
460 Py_INCREF(unicode);
461 return (PyObject *)unicode;
462 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000463
464 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
Walter Dörwald55507312007-05-18 13:12:10 +0000467 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000468 if (!unicode)
469 return NULL;
470
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000471 return (PyObject *)unicode;
472}
473
Walter Dörwaldd2034312007-05-18 16:29:38 +0000474PyObject *PyUnicode_FromString(const char *u)
475{
476 size_t size = strlen(u);
477 if (size > PY_SSIZE_T_MAX) {
478 PyErr_SetString(PyExc_OverflowError, "input too long");
479 return NULL;
480 }
481
482 return PyUnicode_FromStringAndSize(u, size);
483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485#ifdef HAVE_WCHAR_H
486
487PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000488 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489{
490 PyUnicodeObject *unicode;
491
492 if (w == NULL) {
493 PyErr_BadInternalCall();
494 return NULL;
495 }
496
497 unicode = _PyUnicode_New(size);
498 if (!unicode)
499 return NULL;
500
501 /* Copy the wchar_t data into the new object */
502#ifdef HAVE_USABLE_WCHAR_T
503 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000504#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 {
506 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000509 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 *u++ = *w++;
511 }
512#endif
513
514 return (PyObject *)unicode;
515}
516
Walter Dörwald346737f2007-05-31 10:44:43 +0000517static void
518makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
519{
520 *fmt++ = '%';
521 if (width) {
522 if (zeropad)
523 *fmt++ = '0';
524 fmt += sprintf(fmt, "%d", width);
525 }
526 if (precision)
527 fmt += sprintf(fmt, ".%d", precision);
528 if (longflag)
529 *fmt++ = 'l';
530 else if (size_tflag) {
531 char *f = PY_FORMAT_SIZE_T;
532 while (*f)
533 *fmt++ = *f++;
534 }
535 *fmt++ = c;
536 *fmt = '\0';
537}
538
/* Append the NUL-terminated char string `string` to the output cursor.
   Relies on two locals of the expanding function: `copy` (a const char *
   scratch pointer) and `s` (the output cursor, advanced past the copied
   characters).  The terminating NUL is not copied. */
#define appendstring(string) \
    {for (copy = (string); *copy != '\0';) *s++ = *copy++;}
540
541PyObject *
542PyUnicode_FromFormatV(const char *format, va_list vargs)
543{
544 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000545 Py_ssize_t callcount = 0;
546 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000547 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 int width = 0;
550 int precision = 0;
551 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000552 const char* f;
553 Py_UNICODE *s;
554 PyObject *string;
555 /* used by sprintf */
556 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000557 /* use abuffer instead of buffer, if we need more space
558 * (which can happen if there's a format specifier with width). */
559 char *abuffer = NULL;
560 char *realbuffer;
561 Py_ssize_t abuffersize = 0;
562 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000563 const char *copy;
564
565#ifdef VA_LIST_IS_ARRAY
566 Py_MEMCPY(count, vargs, sizeof(va_list));
567#else
568#ifdef __va_copy
569 __va_copy(count, vargs);
570#else
571 count = vargs;
572#endif
573#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 1: count the number of %S/%R format specifications
575 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
576 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000577 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000578 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 ++callcount;
580 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000581 /* step 2: allocate memory for the results of
582 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000583 if (callcount) {
584 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
585 if (!callresults) {
586 PyErr_NoMemory();
587 return NULL;
588 }
589 callresult = callresults;
590 }
591 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000592 for (f = format; *f; f++) {
593 if (*f == '%') {
594 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000595 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000596 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000597 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000598 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000599 ;
600
601 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
602 * they don't affect the amount of space we reserve.
603 */
604 if ((*f == 'l' || *f == 'z') &&
605 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000606 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607
608 switch (*f) {
609 case 'c':
610 (void)va_arg(count, int);
611 /* fall through... */
612 case '%':
613 n++;
614 break;
615 case 'd': case 'u': case 'i': case 'x':
616 (void) va_arg(count, int);
617 /* 20 bytes is enough to hold a 64-bit
618 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000619 This isn't enough for octal.
620 If a width is specified we need more
621 (which we allocate later). */
622 if (width < 20)
623 width = 20;
624 n += width;
625 if (abuffersize < width)
626 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000627 break;
628 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000629 {
630 /* UTF-8 */
631 unsigned char*s;
632 s = va_arg(count, unsigned char*);
633 while (*s) {
634 if (*s < 128) {
635 n++; s++;
636 } else if (*s < 0xc0) {
637 /* invalid UTF-8 */
638 n++; s++;
639 } else if (*s < 0xc0) {
640 n++;
641 s++; if(!*s)break;
642 s++;
643 } else if (*s < 0xe0) {
644 n++;
645 s++; if(!*s)break;
646 s++; if(!*s)break;
647 s++;
648 } else {
649 #ifdef Py_UNICODE_WIDE
650 n++;
651 #else
652 n+=2;
653 #endif
654 s++; if(!*s)break;
655 s++; if(!*s)break;
656 s++; if(!*s)break;
657 s++;
658 }
659 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000660 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000661 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 case 'U':
663 {
664 PyObject *obj = va_arg(count, PyObject *);
665 assert(obj && PyUnicode_Check(obj));
666 n += PyUnicode_GET_SIZE(obj);
667 break;
668 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000669 case 'V':
670 {
671 PyObject *obj = va_arg(count, PyObject *);
672 const char *str = va_arg(count, const char *);
673 assert(obj || str);
674 assert(!obj || PyUnicode_Check(obj));
675 if (obj)
676 n += PyUnicode_GET_SIZE(obj);
677 else
678 n += strlen(str);
679 break;
680 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000681 case 'S':
682 {
683 PyObject *obj = va_arg(count, PyObject *);
684 PyObject *str;
685 assert(obj);
686 str = PyObject_Unicode(obj);
687 if (!str)
688 goto fail;
689 n += PyUnicode_GET_SIZE(str);
690 /* Remember the str and switch to the next slot */
691 *callresult++ = str;
692 break;
693 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000694 case 'R':
695 {
696 PyObject *obj = va_arg(count, PyObject *);
697 PyObject *repr;
698 assert(obj);
699 repr = PyObject_Repr(obj);
700 if (!repr)
701 goto fail;
702 n += PyUnicode_GET_SIZE(repr);
703 /* Remember the repr and switch to the next slot */
704 *callresult++ = repr;
705 break;
706 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000707 case 'p':
708 (void) va_arg(count, int);
709 /* maximum 64-bit pointer representation:
710 * 0xffffffffffffffff
711 * so 19 characters is enough.
712 * XXX I count 18 -- what's the extra for?
713 */
714 n += 19;
715 break;
716 default:
717 /* if we stumble upon an unknown
718 formatting code, copy the rest of
719 the format string to the output
720 string. (we cannot just skip the
721 code, since there's no way to know
722 what's in the argument list) */
723 n += strlen(p);
724 goto expand;
725 }
726 } else
727 n++;
728 }
729 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 if (abuffersize > 20) {
731 abuffer = PyMem_Malloc(abuffersize);
732 if (!abuffer) {
733 PyErr_NoMemory();
734 goto fail;
735 }
736 realbuffer = abuffer;
737 }
738 else
739 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000740 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000741 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000742 we don't have to resize the string.
743 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 string = PyUnicode_FromUnicode(NULL, n);
745 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747
748 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000749 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750
751 for (f = format; *f; f++) {
752 if (*f == '%') {
753 const char* p = f++;
754 int longflag = 0;
755 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 zeropad = (*f == '0');
757 /* parse the width.precision part */
758 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000759 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 width = (width*10) + *f++ - '0';
761 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 if (*f == '.') {
763 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000764 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 /* handle the long flag, but only for %ld and %lu.
768 others can be added when necessary. */
769 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
770 longflag = 1;
771 ++f;
772 }
773 /* handle the size_t flag. */
774 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
775 size_tflag = 1;
776 ++f;
777 }
778
779 switch (*f) {
780 case 'c':
781 *s++ = va_arg(vargs, int);
782 break;
783 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000784 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000786 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000788 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000789 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000790 sprintf(realbuffer, fmt, va_arg(vargs, int));
791 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 break;
793 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000794 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000795 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000796 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000798 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000800 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
801 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 break;
803 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
805 sprintf(realbuffer, fmt, va_arg(vargs, int));
806 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 break;
808 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000809 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
810 sprintf(realbuffer, fmt, va_arg(vargs, int));
811 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000812 break;
813 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000814 {
815 /* Parameter must be UTF-8 encoded.
816 In case of encoding errors, use
817 the replacement character. */
818 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000820 u = PyUnicode_DecodeUTF8(p, strlen(p),
821 "replace");
822 if (!u)
823 goto fail;
824 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
825 PyUnicode_GET_SIZE(u));
826 s += PyUnicode_GET_SIZE(u);
827 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000829 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000830 case 'U':
831 {
832 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000833 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
834 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
835 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 break;
837 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000838 case 'V':
839 {
840 PyObject *obj = va_arg(vargs, PyObject *);
841 const char *str = va_arg(vargs, const char *);
842 if (obj) {
843 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
844 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
845 s += size;
846 } else {
847 appendstring(str);
848 }
849 break;
850 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000851 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 case 'R':
853 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000854 Py_UNICODE *ucopy;
855 Py_ssize_t usize;
856 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000857 /* unused, since we already have the result */
858 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000859 ucopy = PyUnicode_AS_UNICODE(*callresult);
860 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 for (upos = 0; upos<usize;)
862 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000863 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000864 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000865 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000866 ++callresult;
867 break;
868 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000869 case 'p':
870 sprintf(buffer, "%p", va_arg(vargs, void*));
871 /* %p is ill-defined: ensure leading 0x. */
872 if (buffer[1] == 'X')
873 buffer[1] = 'x';
874 else if (buffer[1] != 'x') {
875 memmove(buffer+2, buffer, strlen(buffer)+1);
876 buffer[0] = '0';
877 buffer[1] = 'x';
878 }
879 appendstring(buffer);
880 break;
881 case '%':
882 *s++ = '%';
883 break;
884 default:
885 appendstring(p);
886 goto end;
887 }
888 } else
889 *s++ = *f;
890 }
891
892 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000893 if (callresults)
894 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 if (abuffer)
896 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
898 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000899 fail:
900 if (callresults) {
901 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000902 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000903 Py_DECREF(*callresult2);
904 ++callresult2;
905 }
906 PyMem_Free(callresults);
907 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000908 if (abuffer)
909 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000910 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000911}
912
913#undef appendstring
914
915PyObject *
916PyUnicode_FromFormat(const char *format, ...)
917{
918 PyObject* ret;
919 va_list vargs;
920
921#ifdef HAVE_STDARG_PROTOTYPES
922 va_start(vargs, format);
923#else
924 va_start(vargs);
925#endif
926 ret = PyUnicode_FromFormatV(format, vargs);
927 va_end(vargs);
928 return ret;
929}
930
Martin v. Löwis18e16552006-02-15 17:27:45 +0000931Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
932 wchar_t *w,
933 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934{
935 if (unicode == NULL) {
936 PyErr_BadInternalCall();
937 return -1;
938 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000939
940 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000942 size = PyUnicode_GET_SIZE(unicode) + 1;
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944#ifdef HAVE_USABLE_WCHAR_T
945 memcpy(w, unicode->str, size * sizeof(wchar_t));
946#else
947 {
948 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000949 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000951 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 *w++ = *u++;
953 }
954#endif
955
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000956 if (size > PyUnicode_GET_SIZE(unicode))
957 return PyUnicode_GET_SIZE(unicode);
958 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000959 return size;
960}
961
962#endif
963
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000964PyObject *PyUnicode_FromOrdinal(int ordinal)
965{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000966 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000967
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000968 if (ordinal < 0 || ordinal > 0x10ffff) {
969 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000970 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000971 return NULL;
972 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000973
974#ifndef Py_UNICODE_WIDE
975 if (ordinal > 0xffff) {
976 ordinal -= 0x10000;
977 s[0] = 0xD800 | (ordinal >> 10);
978 s[1] = 0xDC00 | (ordinal & 0x3FF);
979 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000980 }
981#endif
982
Hye-Shik Chang40574832004-04-06 07:24:51 +0000983 s[0] = (Py_UNICODE)ordinal;
984 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000985}
986
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987PyObject *PyUnicode_FromObject(register PyObject *obj)
988{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989 /* XXX Perhaps we should make this API an alias of
990 PyObject_Unicode() instead ?! */
991 if (PyUnicode_CheckExact(obj)) {
992 Py_INCREF(obj);
993 return obj;
994 }
995 if (PyUnicode_Check(obj)) {
996 /* For a Unicode subtype that's not a Unicode object,
997 return a true Unicode object with the same data. */
998 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
999 PyUnicode_GET_SIZE(obj));
1000 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001001 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1002}
1003
1004PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1005 const char *encoding,
1006 const char *errors)
1007{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001008 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001009 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001011
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 if (obj == NULL) {
1013 PyErr_BadInternalCall();
1014 return NULL;
1015 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017 if (PyUnicode_Check(obj)) {
1018 PyErr_SetString(PyExc_TypeError,
1019 "decoding Unicode is not supported");
1020 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001021 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001022
1023 /* Coerce object */
1024 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001025 s = PyString_AS_STRING(obj);
1026 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001027 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001028 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1029 /* Overwrite the error message with something more useful in
1030 case of a TypeError. */
1031 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001032 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001033 "coercing to Unicode: need string or buffer, "
1034 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001035 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001036 goto onError;
1037 }
Tim Petersced69f82003-09-16 20:30:58 +00001038
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001039 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 if (len == 0) {
1041 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 }
Tim Petersced69f82003-09-16 20:30:58 +00001044 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001046
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001047 return v;
1048
1049 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051}
1052
1053PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001054 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 const char *encoding,
1056 const char *errors)
1057{
1058 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001059 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001060 char lower[20]; /* Enough for any encoding name we recognize */
1061 char *l;
1062 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001063
1064 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001065 encoding = PyUnicode_GetDefaultEncoding();
1066
1067 /* Convert encoding to lower case and replace '_' with '-' in order to
1068 catch e.g. UTF_8 */
1069 e = encoding;
1070 l = lower;
1071 while (*e && l < &lower[(sizeof lower) - 2]) {
1072 if (ISUPPER(*e)) {
1073 *l++ = TOLOWER(*e++);
1074 }
1075 else if (*e == '_') {
1076 *l++ = '-';
1077 e++;
1078 }
1079 else {
1080 *l++ = *e++;
1081 }
1082 }
1083 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001084
1085 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001086 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001088 else if ((strcmp(lower, "latin-1") == 0) ||
1089 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001090 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001091#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001092 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001093 return PyUnicode_DecodeMBCS(s, size, errors);
1094#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001095 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001096 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001097 else if (strcmp(lower, "utf-16") == 0)
1098 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1099 else if (strcmp(lower, "utf-32") == 0)
1100 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101
1102 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001103 buffer = NULL;
1104 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1105 goto onError;
1106 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107 if (buffer == NULL)
1108 goto onError;
1109 unicode = PyCodec_Decode(buffer, encoding, errors);
1110 if (unicode == NULL)
1111 goto onError;
1112 if (!PyUnicode_Check(unicode)) {
1113 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001114 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001115 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 Py_DECREF(unicode);
1117 goto onError;
1118 }
1119 Py_DECREF(buffer);
1120 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001121
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 onError:
1123 Py_XDECREF(buffer);
1124 return NULL;
1125}
1126
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001127PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1128 const char *encoding,
1129 const char *errors)
1130{
1131 PyObject *v;
1132
1133 if (!PyUnicode_Check(unicode)) {
1134 PyErr_BadArgument();
1135 goto onError;
1136 }
1137
1138 if (encoding == NULL)
1139 encoding = PyUnicode_GetDefaultEncoding();
1140
1141 /* Decode via the codec registry */
1142 v = PyCodec_Decode(unicode, encoding, errors);
1143 if (v == NULL)
1144 goto onError;
1145 return v;
1146
1147 onError:
1148 return NULL;
1149}
1150
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001152 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 const char *encoding,
1154 const char *errors)
1155{
1156 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001157
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 unicode = PyUnicode_FromUnicode(s, size);
1159 if (unicode == NULL)
1160 return NULL;
1161 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1162 Py_DECREF(unicode);
1163 return v;
1164}
1165
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001166PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1167 const char *encoding,
1168 const char *errors)
1169{
1170 PyObject *v;
1171
1172 if (!PyUnicode_Check(unicode)) {
1173 PyErr_BadArgument();
1174 goto onError;
1175 }
1176
1177 if (encoding == NULL)
1178 encoding = PyUnicode_GetDefaultEncoding();
1179
1180 /* Encode via the codec registry */
1181 v = PyCodec_Encode(unicode, encoding, errors);
1182 if (v == NULL)
1183 goto onError;
1184 return v;
1185
1186 onError:
1187 return NULL;
1188}
1189
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1191 const char *encoding,
1192 const char *errors)
1193{
1194 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001195
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_BadArgument();
1198 goto onError;
1199 }
Fred Drakee4315f52000-05-09 19:53:39 +00001200
Tim Petersced69f82003-09-16 20:30:58 +00001201 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001202 encoding = PyUnicode_GetDefaultEncoding();
1203
1204 /* Shortcuts for common default encodings */
1205 if (errors == NULL) {
1206 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001207 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001208 else if (strcmp(encoding, "latin-1") == 0)
1209 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001210#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1211 else if (strcmp(encoding, "mbcs") == 0)
1212 return PyUnicode_AsMBCSString(unicode);
1213#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001214 else if (strcmp(encoding, "ascii") == 0)
1215 return PyUnicode_AsASCIIString(unicode);
1216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218 /* Encode via the codec registry */
1219 v = PyCodec_Encode(unicode, encoding, errors);
1220 if (v == NULL)
1221 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001222 if (!PyBytes_Check(v)) {
1223 if (PyString_Check(v)) {
1224 /* Old codec, turn it into bytes */
1225 PyObject *b = PyBytes_FromObject(v);
1226 Py_DECREF(v);
1227 return b;
1228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001230 "encoder did not return a bytes object "
1231 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1232 v->ob_type->tp_name,
1233 encoding ? encoding : "NULL",
1234 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 Py_DECREF(v);
1236 goto onError;
1237 }
1238 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001239
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 onError:
1241 return NULL;
1242}
1243
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001244PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1245 const char *errors)
1246{
1247 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001248 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001249 if (v)
1250 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001251 if (errors != NULL)
1252 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001253 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1254 PyUnicode_GET_SIZE(unicode),
1255 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001256 if (!b)
1257 return NULL;
1258 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1259 PyBytes_Size(b));
1260 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001261 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001262 return v;
1263}
1264
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001265PyObject*
1266PyUnicode_DecodeFSDefault(const char *s)
1267{
1268 Py_ssize_t size = (Py_ssize_t)strlen(s);
1269
1270 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1271 can be undefined. If it is case, decode using UTF-8. The following assumes
1272 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1273 bootstrapping process where the codecs aren't ready yet.
1274 */
1275 if (Py_FileSystemDefaultEncoding) {
1276#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1277 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs")) {
1278 return PyUnicode_DecodeMBCS(s, size, "replace");
1279 }
1280#elif defined(__APPLE__)
1281 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8")) {
1282 return PyUnicode_DecodeUTF8(s, size, "replace");
1283 }
1284#endif
1285 return PyUnicode_Decode(s, size,
1286 Py_FileSystemDefaultEncoding,
1287 "replace");
1288 }
1289 else {
1290 return PyUnicode_DecodeUTF8(s, size, "replace");
1291 }
1292}
1293
Martin v. Löwis5b222132007-06-10 09:51:05 +00001294char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001295PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001296{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001297 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001298 if (!PyUnicode_Check(unicode)) {
1299 PyErr_BadArgument();
1300 return NULL;
1301 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001302 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1303 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001304 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001305 if (psize != NULL)
1306 *psize = PyString_GET_SIZE(str8);
1307 return PyString_AS_STRING(str8);
1308}
1309
1310char*
1311PyUnicode_AsString(PyObject *unicode)
1312{
1313 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001314}
1315
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1317{
1318 if (!PyUnicode_Check(unicode)) {
1319 PyErr_BadArgument();
1320 goto onError;
1321 }
1322 return PyUnicode_AS_UNICODE(unicode);
1323
1324 onError:
1325 return NULL;
1326}
1327
Martin v. Löwis18e16552006-02-15 17:27:45 +00001328Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329{
1330 if (!PyUnicode_Check(unicode)) {
1331 PyErr_BadArgument();
1332 goto onError;
1333 }
1334 return PyUnicode_GET_SIZE(unicode);
1335
1336 onError:
1337 return -1;
1338}
1339
Thomas Wouters78890102000-07-22 19:25:51 +00001340const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001341{
1342 return unicode_default_encoding;
1343}
1344
1345int PyUnicode_SetDefaultEncoding(const char *encoding)
1346{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001347 if (strcmp(encoding, unicode_default_encoding) != 0) {
1348 PyErr_Format(PyExc_ValueError,
1349 "Can only set default encoding to %s",
1350 unicode_default_encoding);
1351 return -1;
1352 }
Fred Drakee4315f52000-05-09 19:53:39 +00001353 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001354}
1355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001356/* error handling callback helper:
1357 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001358 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359 and adjust various state variables.
1360 return 0 on success, -1 on error
1361*/
1362
1363static
1364int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1365 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001366 const char **input, const char **inend, Py_ssize_t *startinpos,
1367 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001368 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001369{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001370 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371
1372 PyObject *restuple = NULL;
1373 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001374 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001375 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001376 Py_ssize_t requiredsize;
1377 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001378 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001379 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001380 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001381 int res = -1;
1382
1383 if (*errorHandler == NULL) {
1384 *errorHandler = PyCodec_LookupError(errors);
1385 if (*errorHandler == NULL)
1386 goto onError;
1387 }
1388
1389 if (*exceptionObject == NULL) {
1390 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001391 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001392 if (*exceptionObject == NULL)
1393 goto onError;
1394 }
1395 else {
1396 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1397 goto onError;
1398 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1399 goto onError;
1400 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1401 goto onError;
1402 }
1403
1404 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1405 if (restuple == NULL)
1406 goto onError;
1407 if (!PyTuple_Check(restuple)) {
1408 PyErr_Format(PyExc_TypeError, &argparse[4]);
1409 goto onError;
1410 }
1411 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1412 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001413
1414 /* Copy back the bytes variables, which might have been modified by the
1415 callback */
1416 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1417 if (!inputobj)
1418 goto onError;
1419 if (!PyBytes_Check(inputobj)) {
1420 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1421 }
1422 *input = PyBytes_AS_STRING(inputobj);
1423 insize = PyBytes_GET_SIZE(inputobj);
1424 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001425 /* we can DECREF safely, as the exception has another reference,
1426 so the object won't go away. */
1427 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001430 newpos = insize+newpos;
1431 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001432 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001433 goto onError;
1434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435
1436 /* need more space? (at least enough for what we
1437 have+the replacement+the rest of the string (starting
1438 at the new input position), so we won't have to check space
1439 when there are no errors in the rest of the string) */
1440 repptr = PyUnicode_AS_UNICODE(repunicode);
1441 repsize = PyUnicode_GET_SIZE(repunicode);
1442 requiredsize = *outpos + repsize + insize-newpos;
1443 if (requiredsize > outsize) {
1444 if (requiredsize<2*outsize)
1445 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001446 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447 goto onError;
1448 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1449 }
1450 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001451 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 Py_UNICODE_COPY(*outptr, repptr, repsize);
1453 *outptr += repsize;
1454 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 /* we made it! */
1457 res = 0;
1458
1459 onError:
1460 Py_XDECREF(restuple);
1461 return res;
1462}
1463
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001464/* --- UTF-7 Codec -------------------------------------------------------- */
1465
1466/* see RFC2152 for details */
1467
/* Classification of the 128 ASCII code points for the UTF-7 codec, indexed
   by character value (16 entries per row).  Read by the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7F */

};
1486
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be written
   directly in UTF-7 output.  That is the case when c is outside 1..127,
   when utf7_special[] classifies it as 1 (special), when it is class 2
   (whitespace) and encodeWS is set, or when it is class 3 (Set O) and
   encodeO is set.  The argument c is evaluated several times.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))
1496
/* B64(n): the base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is alphanumeric (per ISALNUM, defined elsewhere
   in this file), '+' or '/'. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64() -- maps a base64 character to its 6-bit value:
   '+' -> 62, '/' -> 63, 'a'.. -> c - 71, 'A'.. -> c - 65, otherwise c + 4
   (the digits).  The arithmetic relies on ASCII character values and does
   no validation; presumably callers check B64CHAR(c) first -- verify
   against the call sites. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001504
/* ENCODE(out, ch, bits): while at least 6 bits are pending in the bit
   accumulator ch, write the base64 character for the topmost 6 pending
   bits to *out (advancing out) and reduce bits by 6.  Expands to a bare
   while statement and modifies out and bits in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }
1510
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   the bit accumulator ch, extract the topmost 16 pending bits as one
   Py_UNICODE value and reduce bits by 16.  If the surrogate flag is set
   the value is dropped and the flag cleared; if the value lies in
   0xDC00..0xDFFF the flag is set, errmsg is assigned and control jumps to
   utf7Error; otherwise the value is written to *out (advancing out).

   Expands to a bare while statement and requires a variable `errmsg` and
   a label `utf7Error` to be in scope at the point of use.

   NOTE(review): the inner comment speaks of the "high surrogate", while
   0xDC00..0xDFFF is the low-surrogate range in UTF-16 -- confirm which
   range was intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001529
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001530PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001531 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001532 const char *errors)
1533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001535 Py_ssize_t startinpos;
1536 Py_ssize_t endinpos;
1537 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 const char *e;
1539 PyUnicodeObject *unicode;
1540 Py_UNICODE *p;
1541 const char *errmsg = "";
1542 int inShift = 0;
1543 unsigned int bitsleft = 0;
1544 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 int surrogate = 0;
1546 PyObject *errorHandler = NULL;
1547 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548
1549 unicode = _PyUnicode_New(size);
1550 if (!unicode)
1551 return NULL;
1552 if (size == 0)
1553 return (PyObject *)unicode;
1554
1555 p = unicode->str;
1556 e = s + size;
1557
1558 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001559 Py_UNICODE ch;
1560 restart:
1561 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001562
1563 if (inShift) {
1564 if ((ch == '-') || !B64CHAR(ch)) {
1565 inShift = 0;
1566 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001567
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1569 if (bitsleft >= 6) {
1570 /* The shift sequence has a partial character in it. If
1571 bitsleft < 6 then we could just classify it as padding
1572 but that is not the case here */
1573
1574 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001575 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 }
1577 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001578 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 here so indicate the potential of a misencoded character. */
1580
1581 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1582 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1583 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001584 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001585 }
1586
1587 if (ch == '-') {
1588 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001589 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590 inShift = 1;
1591 }
1592 } else if (SPECIAL(ch,0,0)) {
1593 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001594 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 } else {
1596 *p++ = ch;
1597 }
1598 } else {
1599 charsleft = (charsleft << 6) | UB64(ch);
1600 bitsleft += 6;
1601 s++;
1602 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1603 }
1604 }
1605 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607 s++;
1608 if (s < e && *s == '-') {
1609 s++;
1610 *p++ = '+';
1611 } else
1612 {
1613 inShift = 1;
1614 bitsleft = 0;
1615 }
1616 }
1617 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001618 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001619 errmsg = "unexpected special character";
1620 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001621 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 }
1623 else {
1624 *p++ = ch;
1625 s++;
1626 }
1627 continue;
1628 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 outpos = p-PyUnicode_AS_UNICODE(unicode);
1630 endinpos = s-starts;
1631 if (unicode_decode_call_errorhandler(
1632 errors, &errorHandler,
1633 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001634 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001635 (PyObject **)&unicode, &outpos, &p))
1636 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637 }
1638
1639 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 outpos = p-PyUnicode_AS_UNICODE(unicode);
1641 endinpos = size;
1642 if (unicode_decode_call_errorhandler(
1643 errors, &errorHandler,
1644 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001645 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 if (s < e)
1649 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 }
1651
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001652 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 goto onError;
1654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655 Py_XDECREF(errorHandler);
1656 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 return (PyObject *)unicode;
1658
1659onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 Py_XDECREF(errorHandler);
1661 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 Py_DECREF(unicode);
1663 return NULL;
1664}
1665
1666
1667PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001668 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001669 int encodeSetO,
1670 int encodeWhiteSpace,
1671 const char *errors)
1672{
1673 PyObject *v;
1674 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001675 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001677 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001678 unsigned int bitsleft = 0;
1679 unsigned long charsleft = 0;
1680 char * out;
1681 char * start;
1682
1683 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001684 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685
Walter Dörwald51ab4142007-05-05 14:43:36 +00001686 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001687 if (v == NULL)
1688 return NULL;
1689
Walter Dörwald51ab4142007-05-05 14:43:36 +00001690 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 for (;i < size; ++i) {
1692 Py_UNICODE ch = s[i];
1693
1694 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001695 if (ch == '+') {
1696 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001697 *out++ = '-';
1698 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1699 charsleft = ch;
1700 bitsleft = 16;
1701 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001702 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001703 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001704 } else {
1705 *out++ = (char) ch;
1706 }
1707 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1709 *out++ = B64(charsleft << (6-bitsleft));
1710 charsleft = 0;
1711 bitsleft = 0;
1712 /* Characters not in the BASE64 set implicitly unshift the sequence
1713 so no '-' is required, except if the character is itself a '-' */
1714 if (B64CHAR(ch) || ch == '-') {
1715 *out++ = '-';
1716 }
1717 inShift = 0;
1718 *out++ = (char) ch;
1719 } else {
1720 bitsleft += 16;
1721 charsleft = (charsleft << 16) | ch;
1722 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1723
1724 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001725 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 or '-' then the shift sequence will be terminated implicitly and we
1727 don't have to insert a '-'. */
1728
1729 if (bitsleft == 0) {
1730 if (i + 1 < size) {
1731 Py_UNICODE ch2 = s[i+1];
1732
1733 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001734
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001735 } else if (B64CHAR(ch2) || ch2 == '-') {
1736 *out++ = '-';
1737 inShift = 0;
1738 } else {
1739 inShift = 0;
1740 }
1741
1742 }
1743 else {
1744 *out++ = '-';
1745 inShift = 0;
1746 }
1747 }
Tim Petersced69f82003-09-16 20:30:58 +00001748 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751 if (bitsleft) {
1752 *out++= B64(charsleft << (6-bitsleft) );
1753 *out++ = '-';
1754 }
1755
Walter Dörwald51ab4142007-05-05 14:43:36 +00001756 if (PyBytes_Resize(v, out - start)) {
1757 Py_DECREF(v);
1758 return NULL;
1759 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001760 return v;
1761}
1762
1763#undef SPECIAL
1764#undef B64
1765#undef B64CHAR
1766#undef UB64
1767#undef ENCODE
1768#undef DECODE
1769
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770/* --- UTF-8 Codec -------------------------------------------------------- */
1771
/* Map a UTF-8 lead byte to the length in bytes of the sequence it starts.
   Zero means the byte is not a legal lead byte (continuation bytes
   0x80..0xBF, and 0xFE/0xFF).  See RFC 2279 for details.  The table is
   read-only, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFF: four-, five- and six-byte prefixes, then two illegal bytes */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1793
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001795 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 const char *errors)
1797{
Walter Dörwald69652032004-09-07 20:24:22 +00001798 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1799}
1800
1801PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001802 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001803 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001804 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001805{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001808 Py_ssize_t startinpos;
1809 Py_ssize_t endinpos;
1810 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 const char *e;
1812 PyUnicodeObject *unicode;
1813 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001814 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 PyObject *errorHandler = NULL;
1816 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817
1818 /* Note: size will always be longer than the resulting Unicode
1819 character count */
1820 unicode = _PyUnicode_New(size);
1821 if (!unicode)
1822 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001823 if (size == 0) {
1824 if (consumed)
1825 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828
1829 /* Unpack UTF-8 encoded data */
1830 p = unicode->str;
1831 e = s + size;
1832
1833 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001834 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835
1836 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001837 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 s++;
1839 continue;
1840 }
1841
1842 n = utf8_code_length[ch];
1843
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001845 if (consumed)
1846 break;
1847 else {
1848 errmsg = "unexpected end of data";
1849 startinpos = s-starts;
1850 endinpos = size;
1851 goto utf8Error;
1852 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
1855 switch (n) {
1856
1857 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001858 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001859 startinpos = s-starts;
1860 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001861 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
1863 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001864 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001865 startinpos = s-starts;
1866 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001867 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868
1869 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001870 if ((s[1] & 0xc0) != 0x80) {
1871 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 startinpos = s-starts;
1873 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001874 goto utf8Error;
1875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001877 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 startinpos = s-starts;
1879 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 errmsg = "illegal encoding";
1881 goto utf8Error;
1882 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001884 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 break;
1886
1887 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001888 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001889 (s[2] & 0xc0) != 0x80) {
1890 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 startinpos = s-starts;
1892 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 goto utf8Error;
1894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001896 if (ch < 0x0800) {
1897 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001898 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001899
1900 XXX For wide builds (UCS-4) we should probably try
1901 to recombine the surrogates into a single code
1902 unit.
1903 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001904 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001905 startinpos = s-starts;
1906 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 goto utf8Error;
1908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001910 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001911 break;
1912
1913 case 4:
1914 if ((s[1] & 0xc0) != 0x80 ||
1915 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 (s[3] & 0xc0) != 0x80) {
1917 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001918 startinpos = s-starts;
1919 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001920 goto utf8Error;
1921 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001922 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1923 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1924 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001925 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001926 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001927 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001928 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001929 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001930 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001931 startinpos = s-starts;
1932 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 goto utf8Error;
1934 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001935#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001936 *p++ = (Py_UNICODE)ch;
1937#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001938 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001939
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001940 /* translate from 10000..10FFFF to 0..FFFF */
1941 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001942
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001943 /* high surrogate = top 10 bits added to D800 */
1944 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001945
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001946 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001947 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001948#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 break;
1950
1951 default:
1952 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001953 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 }
1958 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001960
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001961 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 outpos = p-PyUnicode_AS_UNICODE(unicode);
1963 if (unicode_decode_call_errorhandler(
1964 errors, &errorHandler,
1965 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001966 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001967 (PyObject **)&unicode, &outpos, &p))
1968 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 }
Walter Dörwald69652032004-09-07 20:24:22 +00001970 if (consumed)
1971 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972
1973 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001974 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 goto onError;
1976
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001977 Py_XDECREF(errorHandler);
1978 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return (PyObject *)unicode;
1980
1981onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001982 Py_XDECREF(errorHandler);
1983 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 Py_DECREF(unicode);
1985 return NULL;
1986}
1987
Tim Peters602f7402002-04-27 18:03:26 +00001988/* Allocation strategy: if the string is short, convert into a stack buffer
1989 and allocate exactly as much space needed at the end. Else allocate the
1990 maximum possible needed (4 result bytes per Unicode character), and return
1991 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001992*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001993PyObject *
1994PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001995 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997{
Tim Peters602f7402002-04-27 18:03:26 +00001998#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001999
Martin v. Löwis18e16552006-02-15 17:27:45 +00002000 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002001 PyObject *v; /* result string object */
2002 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002003 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002004 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002005 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002006
Tim Peters602f7402002-04-27 18:03:26 +00002007 assert(s != NULL);
2008 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009
Tim Peters602f7402002-04-27 18:03:26 +00002010 if (size <= MAX_SHORT_UNICHARS) {
2011 /* Write into the stack buffer; nallocated can't overflow.
2012 * At the end, we'll allocate exactly as much heap space as it
2013 * turns out we need.
2014 */
2015 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2016 v = NULL; /* will allocate after we're done */
2017 p = stackbuf;
2018 }
2019 else {
2020 /* Overallocate on the heap, and give the excess back at the end. */
2021 nallocated = size * 4;
2022 if (nallocated / 4 != size) /* overflow! */
2023 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002024 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002025 if (v == NULL)
2026 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002027 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002028 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002029
Tim Peters602f7402002-04-27 18:03:26 +00002030 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002031 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002032
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002033 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002034 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002036
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002038 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002039 *p++ = (char)(0xc0 | (ch >> 6));
2040 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002041 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002042 else {
Tim Peters602f7402002-04-27 18:03:26 +00002043 /* Encode UCS2 Unicode ordinals */
2044 if (ch < 0x10000) {
2045 /* Special case: check for high surrogate */
2046 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2047 Py_UCS4 ch2 = s[i];
2048 /* Check for low surrogate and combine the two to
2049 form a UCS4 value */
2050 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002051 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002052 i++;
2053 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002054 }
Tim Peters602f7402002-04-27 18:03:26 +00002055 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002056 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002057 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002058 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2059 *p++ = (char)(0x80 | (ch & 0x3f));
2060 continue;
2061 }
2062encodeUCS4:
2063 /* Encode UCS4 Unicode ordinals */
2064 *p++ = (char)(0xf0 | (ch >> 18));
2065 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2066 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2067 *p++ = (char)(0x80 | (ch & 0x3f));
2068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002070
Tim Peters602f7402002-04-27 18:03:26 +00002071 if (v == NULL) {
2072 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002073 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002074 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002075 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002076 }
2077 else {
2078 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002079 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002080 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002081 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002084
Tim Peters602f7402002-04-27 18:03:26 +00002085#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086}
2087
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2089{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 if (!PyUnicode_Check(unicode)) {
2091 PyErr_BadArgument();
2092 return NULL;
2093 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002094 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2095 PyUnicode_GET_SIZE(unicode),
2096 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097}
2098
Walter Dörwald41980ca2007-08-16 21:55:45 +00002099/* --- UTF-32 Codec ------------------------------------------------------- */
2100
2101PyObject *
2102PyUnicode_DecodeUTF32(const char *s,
2103 Py_ssize_t size,
2104 const char *errors,
2105 int *byteorder)
2106{
2107 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2108}
2109
2110PyObject *
2111PyUnicode_DecodeUTF32Stateful(const char *s,
2112 Py_ssize_t size,
2113 const char *errors,
2114 int *byteorder,
2115 Py_ssize_t *consumed)
2116{
2117 const char *starts = s;
2118 Py_ssize_t startinpos;
2119 Py_ssize_t endinpos;
2120 Py_ssize_t outpos;
2121 PyUnicodeObject *unicode;
2122 Py_UNICODE *p;
2123#ifndef Py_UNICODE_WIDE
2124 int i, pairs;
2125#else
2126 const int pairs = 0;
2127#endif
2128 const unsigned char *q, *e;
2129 int bo = 0; /* assume native ordering by default */
2130 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002131 /* Offsets from q for retrieving bytes in the right order. */
2132#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2133 int iorder[] = {0, 1, 2, 3};
2134#else
2135 int iorder[] = {3, 2, 1, 0};
2136#endif
2137 PyObject *errorHandler = NULL;
2138 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002139 /* On narrow builds we split characters outside the BMP into two
2140 codepoints => count how much extra space we need. */
2141#ifndef Py_UNICODE_WIDE
2142 for (i = pairs = 0; i < size/4; i++)
2143 if (((Py_UCS4 *)s)[i] >= 0x10000)
2144 pairs++;
2145#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002146
2147 /* This might be one to much, because of a BOM */
2148 unicode = _PyUnicode_New((size+3)/4+pairs);
2149 if (!unicode)
2150 return NULL;
2151 if (size == 0)
2152 return (PyObject *)unicode;
2153
2154 /* Unpack UTF-32 encoded data */
2155 p = unicode->str;
2156 q = (unsigned char *)s;
2157 e = q + size;
2158
2159 if (byteorder)
2160 bo = *byteorder;
2161
2162 /* Check for BOM marks (U+FEFF) in the input and adjust current
2163 byte order setting accordingly. In native mode, the leading BOM
2164 mark is skipped, in all other modes, it is copied to the output
2165 stream as-is (giving a ZWNBSP character). */
2166 if (bo == 0) {
2167 if (size >= 4) {
2168 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2169 (q[iorder[1]] << 8) | q[iorder[0]];
2170#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2171 if (bom == 0x0000FEFF) {
2172 q += 4;
2173 bo = -1;
2174 }
2175 else if (bom == 0xFFFE0000) {
2176 q += 4;
2177 bo = 1;
2178 }
2179#else
2180 if (bom == 0x0000FEFF) {
2181 q += 4;
2182 bo = 1;
2183 }
2184 else if (bom == 0xFFFE0000) {
2185 q += 4;
2186 bo = -1;
2187 }
2188#endif
2189 }
2190 }
2191
2192 if (bo == -1) {
2193 /* force LE */
2194 iorder[0] = 0;
2195 iorder[1] = 1;
2196 iorder[2] = 2;
2197 iorder[3] = 3;
2198 }
2199 else if (bo == 1) {
2200 /* force BE */
2201 iorder[0] = 3;
2202 iorder[1] = 2;
2203 iorder[2] = 1;
2204 iorder[3] = 0;
2205 }
2206
2207 while (q < e) {
2208 Py_UCS4 ch;
2209 /* remaining bytes at the end? (size should be divisible by 4) */
2210 if (e-q<4) {
2211 if (consumed)
2212 break;
2213 errmsg = "truncated data";
2214 startinpos = ((const char *)q)-starts;
2215 endinpos = ((const char *)e)-starts;
2216 goto utf32Error;
2217 /* The remaining input chars are ignored if the callback
2218 chooses to skip the input */
2219 }
2220 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2221 (q[iorder[1]] << 8) | q[iorder[0]];
2222
2223 if (ch >= 0x110000)
2224 {
2225 errmsg = "codepoint not in range(0x110000)";
2226 startinpos = ((const char *)q)-starts;
2227 endinpos = startinpos+4;
2228 goto utf32Error;
2229 }
2230#ifndef Py_UNICODE_WIDE
2231 if (ch >= 0x10000)
2232 {
2233 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2234 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2235 }
2236 else
2237#endif
2238 *p++ = ch;
2239 q += 4;
2240 continue;
2241 utf32Error:
2242 outpos = p-PyUnicode_AS_UNICODE(unicode);
2243 if (unicode_decode_call_errorhandler(
2244 errors, &errorHandler,
2245 "utf32", errmsg,
2246 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2247 (PyObject **)&unicode, &outpos, &p))
2248 goto onError;
2249 }
2250
2251 if (byteorder)
2252 *byteorder = bo;
2253
2254 if (consumed)
2255 *consumed = (const char *)q-starts;
2256
2257 /* Adjust length */
2258 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2259 goto onError;
2260
2261 Py_XDECREF(errorHandler);
2262 Py_XDECREF(exc);
2263 return (PyObject *)unicode;
2264
2265onError:
2266 Py_DECREF(unicode);
2267 Py_XDECREF(errorHandler);
2268 Py_XDECREF(exc);
2269 return NULL;
2270}
2271
2272PyObject *
2273PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2274 Py_ssize_t size,
2275 const char *errors,
2276 int byteorder)
2277{
2278 PyObject *v;
2279 unsigned char *p;
2280#ifndef Py_UNICODE_WIDE
2281 int i, pairs;
2282#else
2283 const int pairs = 0;
2284#endif
2285 /* Offsets from p for storing byte pairs in the right order. */
2286#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2287 int iorder[] = {0, 1, 2, 3};
2288#else
2289 int iorder[] = {3, 2, 1, 0};
2290#endif
2291
2292#define STORECHAR(CH) \
2293 do { \
2294 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2295 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2296 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2297 p[iorder[0]] = (CH) & 0xff; \
2298 p += 4; \
2299 } while(0)
2300
2301 /* In narrow builds we can output surrogate pairs as one codepoint,
2302 so we need less space. */
2303#ifndef Py_UNICODE_WIDE
2304 for (i = pairs = 0; i < size-1; i++)
2305 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2306 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2307 pairs++;
2308#endif
2309 v = PyBytes_FromStringAndSize(NULL,
2310 4 * (size - pairs + (byteorder == 0)));
2311 if (v == NULL)
2312 return NULL;
2313
2314 p = (unsigned char *)PyBytes_AS_STRING(v);
2315 if (byteorder == 0)
2316 STORECHAR(0xFEFF);
2317 if (size == 0)
2318 return v;
2319
2320 if (byteorder == -1) {
2321 /* force LE */
2322 iorder[0] = 0;
2323 iorder[1] = 1;
2324 iorder[2] = 2;
2325 iorder[3] = 3;
2326 }
2327 else if (byteorder == 1) {
2328 /* force BE */
2329 iorder[0] = 3;
2330 iorder[1] = 2;
2331 iorder[2] = 1;
2332 iorder[3] = 0;
2333 }
2334
2335 while (size-- > 0) {
2336 Py_UCS4 ch = *s++;
2337#ifndef Py_UNICODE_WIDE
2338 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2339 Py_UCS4 ch2 = *s;
2340 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2341 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2342 s++;
2343 size--;
2344 }
2345 }
2346#endif
2347 STORECHAR(ch);
2348 }
2349 return v;
2350#undef STORECHAR
2351}
2352
2353PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2354{
2355 if (!PyUnicode_Check(unicode)) {
2356 PyErr_BadArgument();
2357 return NULL;
2358 }
2359 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2360 PyUnicode_GET_SIZE(unicode),
2361 NULL,
2362 0);
2363}
2364
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365/* --- UTF-16 Codec ------------------------------------------------------- */
2366
Tim Peters772747b2001-08-09 22:21:55 +00002367PyObject *
2368PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002369 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002370 const char *errors,
2371 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372{
Walter Dörwald69652032004-09-07 20:24:22 +00002373 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2374}
2375
2376PyObject *
2377PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002378 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002379 const char *errors,
2380 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002381 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002382{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002383 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002384 Py_ssize_t startinpos;
2385 Py_ssize_t endinpos;
2386 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 PyUnicodeObject *unicode;
2388 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002389 const unsigned char *q, *e;
2390 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002391 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002392 /* Offsets from q for retrieving byte pairs in the right order. */
2393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2394 int ihi = 1, ilo = 0;
2395#else
2396 int ihi = 0, ilo = 1;
2397#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 PyObject *errorHandler = NULL;
2399 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400
2401 /* Note: size will always be longer than the resulting Unicode
2402 character count */
2403 unicode = _PyUnicode_New(size);
2404 if (!unicode)
2405 return NULL;
2406 if (size == 0)
2407 return (PyObject *)unicode;
2408
2409 /* Unpack UTF-16 encoded data */
2410 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002411 q = (unsigned char *)s;
2412 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413
2414 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002415 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002417 /* Check for BOM marks (U+FEFF) in the input and adjust current
2418 byte order setting accordingly. In native mode, the leading BOM
2419 mark is skipped, in all other modes, it is copied to the output
2420 stream as-is (giving a ZWNBSP character). */
2421 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002422 if (size >= 2) {
2423 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002424#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002425 if (bom == 0xFEFF) {
2426 q += 2;
2427 bo = -1;
2428 }
2429 else if (bom == 0xFFFE) {
2430 q += 2;
2431 bo = 1;
2432 }
Tim Petersced69f82003-09-16 20:30:58 +00002433#else
Walter Dörwald69652032004-09-07 20:24:22 +00002434 if (bom == 0xFEFF) {
2435 q += 2;
2436 bo = 1;
2437 }
2438 else if (bom == 0xFFFE) {
2439 q += 2;
2440 bo = -1;
2441 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002442#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002443 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445
Tim Peters772747b2001-08-09 22:21:55 +00002446 if (bo == -1) {
2447 /* force LE */
2448 ihi = 1;
2449 ilo = 0;
2450 }
2451 else if (bo == 1) {
2452 /* force BE */
2453 ihi = 0;
2454 ilo = 1;
2455 }
2456
2457 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002458 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002459 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002460 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002461 if (consumed)
2462 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 errmsg = "truncated data";
2464 startinpos = ((const char *)q)-starts;
2465 endinpos = ((const char *)e)-starts;
2466 goto utf16Error;
2467 /* The remaining input chars are ignored if the callback
2468 chooses to skip the input */
2469 }
2470 ch = (q[ihi] << 8) | q[ilo];
2471
Tim Peters772747b2001-08-09 22:21:55 +00002472 q += 2;
2473
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 if (ch < 0xD800 || ch > 0xDFFF) {
2475 *p++ = ch;
2476 continue;
2477 }
2478
2479 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002480 if (q >= e) {
2481 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 startinpos = (((const char *)q)-2)-starts;
2483 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002484 goto utf16Error;
2485 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002486 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002487 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2488 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002489 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002490#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002491 *p++ = ch;
2492 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002493#else
2494 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002495#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002496 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002497 }
2498 else {
2499 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 startinpos = (((const char *)q)-4)-starts;
2501 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002502 goto utf16Error;
2503 }
2504
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002506 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 startinpos = (((const char *)q)-2)-starts;
2508 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002509 /* Fall through to report the error */
2510
2511 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002512 outpos = p-PyUnicode_AS_UNICODE(unicode);
2513 if (unicode_decode_call_errorhandler(
2514 errors, &errorHandler,
2515 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002516 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002517 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 }
2520
2521 if (byteorder)
2522 *byteorder = bo;
2523
Walter Dörwald69652032004-09-07 20:24:22 +00002524 if (consumed)
2525 *consumed = (const char *)q-starts;
2526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002528 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 goto onError;
2530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 Py_XDECREF(errorHandler);
2532 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 return (PyObject *)unicode;
2534
2535onError:
2536 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 Py_XDECREF(errorHandler);
2538 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 return NULL;
2540}
2541
Tim Peters772747b2001-08-09 22:21:55 +00002542PyObject *
2543PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002544 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002545 const char *errors,
2546 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547{
2548 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002549 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002550#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002551 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002552#else
2553 const int pairs = 0;
2554#endif
Tim Peters772747b2001-08-09 22:21:55 +00002555 /* Offsets from p for storing byte pairs in the right order. */
2556#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2557 int ihi = 1, ilo = 0;
2558#else
2559 int ihi = 0, ilo = 1;
2560#endif
2561
2562#define STORECHAR(CH) \
2563 do { \
2564 p[ihi] = ((CH) >> 8) & 0xff; \
2565 p[ilo] = (CH) & 0xff; \
2566 p += 2; \
2567 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002569#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002570 for (i = pairs = 0; i < size; i++)
2571 if (s[i] >= 0x10000)
2572 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002573#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002574 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002575 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 if (v == NULL)
2577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578
Walter Dörwald3cc34522007-05-04 10:48:27 +00002579 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002581 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002582 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002583 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002584
2585 if (byteorder == -1) {
2586 /* force LE */
2587 ihi = 1;
2588 ilo = 0;
2589 }
2590 else if (byteorder == 1) {
2591 /* force BE */
2592 ihi = 0;
2593 ilo = 1;
2594 }
2595
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002596 while (size-- > 0) {
2597 Py_UNICODE ch = *s++;
2598 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002599#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002600 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002601 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2602 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002604#endif
Tim Peters772747b2001-08-09 22:21:55 +00002605 STORECHAR(ch);
2606 if (ch2)
2607 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002610#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611}
2612
2613PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2614{
2615 if (!PyUnicode_Check(unicode)) {
2616 PyErr_BadArgument();
2617 return NULL;
2618 }
2619 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2620 PyUnicode_GET_SIZE(unicode),
2621 NULL,
2622 0);
2623}
2624
2625/* --- Unicode Escape Codec ----------------------------------------------- */
2626
/* Character-name lookup API taken from the "ucnhash_CAPI" attribute of
   the unicodedata module.  It stays NULL until
   PyUnicode_DecodeUnicodeEscape meets its first \N escape, loads the
   module and caches the pointer here. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002628
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002630 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 const char *errors)
2632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002634 Py_ssize_t startinpos;
2635 Py_ssize_t endinpos;
2636 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002639 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002641 char* message;
2642 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002643 PyObject *errorHandler = NULL;
2644 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002645
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 /* Escaped strings will always be longer than the resulting
2647 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002648 length after conversion to the true value.
2649 (but if the error callback returns a long replacement string
2650 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 v = _PyUnicode_New(size);
2652 if (v == NULL)
2653 goto onError;
2654 if (size == 0)
2655 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002656
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002659
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 while (s < end) {
2661 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002662 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664
2665 /* Non-escape characters are interpreted as Unicode ordinals */
2666 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002667 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 continue;
2669 }
2670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002671 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 /* \ - Escapes */
2673 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002674 c = *s++;
2675 if (s > end)
2676 c = '\0'; /* Invalid after \ */
2677 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678
2679 /* \x escapes */
2680 case '\n': break;
2681 case '\\': *p++ = '\\'; break;
2682 case '\'': *p++ = '\''; break;
2683 case '\"': *p++ = '\"'; break;
2684 case 'b': *p++ = '\b'; break;
2685 case 'f': *p++ = '\014'; break; /* FF */
2686 case 't': *p++ = '\t'; break;
2687 case 'n': *p++ = '\n'; break;
2688 case 'r': *p++ = '\r'; break;
2689 case 'v': *p++ = '\013'; break; /* VT */
2690 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2691
2692 /* \OOO (octal) escapes */
2693 case '0': case '1': case '2': case '3':
2694 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002695 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002696 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002697 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002698 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002699 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002701 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 break;
2703
Fredrik Lundhccc74732001-02-18 22:13:49 +00002704 /* hex escapes */
2705 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002707 digits = 2;
2708 message = "truncated \\xXX escape";
2709 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002713 digits = 4;
2714 message = "truncated \\uXXXX escape";
2715 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716
Fredrik Lundhccc74732001-02-18 22:13:49 +00002717 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002718 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002719 digits = 8;
2720 message = "truncated \\UXXXXXXXX escape";
2721 hexescape:
2722 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 outpos = p-PyUnicode_AS_UNICODE(v);
2724 if (s+digits>end) {
2725 endinpos = size;
2726 if (unicode_decode_call_errorhandler(
2727 errors, &errorHandler,
2728 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002729 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002730 (PyObject **)&v, &outpos, &p))
2731 goto onError;
2732 goto nextByte;
2733 }
2734 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002735 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002736 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 endinpos = (s+i+1)-starts;
2738 if (unicode_decode_call_errorhandler(
2739 errors, &errorHandler,
2740 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002741 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002743 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002745 }
2746 chr = (chr<<4) & ~0xF;
2747 if (c >= '0' && c <= '9')
2748 chr += c - '0';
2749 else if (c >= 'a' && c <= 'f')
2750 chr += 10 + c - 'a';
2751 else
2752 chr += 10 + c - 'A';
2753 }
2754 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002755 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 /* _decoding_error will have already written into the
2757 target buffer. */
2758 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002759 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002760 /* when we get here, chr is a 32-bit unicode character */
2761 if (chr <= 0xffff)
2762 /* UCS-2 character */
2763 *p++ = (Py_UNICODE) chr;
2764 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002765 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002766 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002767#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002768 *p++ = chr;
2769#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002770 chr -= 0x10000L;
2771 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002772 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002773#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002774 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 endinpos = s-starts;
2776 outpos = p-PyUnicode_AS_UNICODE(v);
2777 if (unicode_decode_call_errorhandler(
2778 errors, &errorHandler,
2779 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002780 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002781 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002782 goto onError;
2783 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002784 break;
2785
2786 /* \N{name} */
2787 case 'N':
2788 message = "malformed \\N character escape";
2789 if (ucnhash_CAPI == NULL) {
2790 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002791 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002792 m = PyImport_ImportModule("unicodedata");
2793 if (m == NULL)
2794 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002795 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002796 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002797 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002798 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002799 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002800 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002801 if (ucnhash_CAPI == NULL)
2802 goto ucnhashError;
2803 }
2804 if (*s == '{') {
2805 const char *start = s+1;
2806 /* look for the closing brace */
2807 while (*s != '}' && s < end)
2808 s++;
2809 if (s > start && s < end && *s == '}') {
2810 /* found a name. look it up in the unicode database */
2811 message = "unknown Unicode character name";
2812 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002813 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 goto store;
2815 }
2816 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 endinpos = s-starts;
2818 outpos = p-PyUnicode_AS_UNICODE(v);
2819 if (unicode_decode_call_errorhandler(
2820 errors, &errorHandler,
2821 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002822 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002824 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002825 break;
2826
2827 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002828 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 message = "\\ at end of string";
2830 s--;
2831 endinpos = s-starts;
2832 outpos = p-PyUnicode_AS_UNICODE(v);
2833 if (unicode_decode_call_errorhandler(
2834 errors, &errorHandler,
2835 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002836 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002838 goto onError;
2839 }
2840 else {
2841 *p++ = '\\';
2842 *p++ = (unsigned char)s[-1];
2843 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002844 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 nextByte:
2847 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002849 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002851 Py_XDECREF(errorHandler);
2852 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002854
Fredrik Lundhccc74732001-02-18 22:13:49 +00002855ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002856 PyErr_SetString(
2857 PyExc_UnicodeError,
2858 "\\N escapes not supported (can't load unicodedata module)"
2859 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002860 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 Py_XDECREF(errorHandler);
2862 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002863 return NULL;
2864
Fredrik Lundhccc74732001-02-18 22:13:49 +00002865onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 Py_XDECREF(errorHandler);
2868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 return NULL;
2870}
2871
/* Helpers for producing the Unicode-Escape string version of Unicode
   data.

   Note: the encoder below takes no "quotes" argument and does not
   enclose its result in u"" or u'' quotes.

*/
2878
Thomas Wouters477c8d52006-05-27 19:21:47 +00002879Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2880 Py_ssize_t size,
2881 Py_UNICODE ch)
2882{
2883 /* like wcschr, but doesn't stop at NULL characters */
2884
2885 while (size-- > 0) {
2886 if (*s == ch)
2887 return s;
2888 s++;
2889 }
2890
2891 return NULL;
2892}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002893
/* Lower-case hexadecimal digits, indexed by nibble value (0..15).
   The pointer itself is const so the table cannot be redirected. */
static const char * const hexdigits = "0123456789abcdef";
2895
2896PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2897 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898{
2899 PyObject *repr;
2900 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901
Thomas Wouters89f507f2006-12-13 04:49:30 +00002902 /* XXX(nnorwitz): rather than over-allocating, it would be
2903 better to choose a different scheme. Perhaps scan the
2904 first N-chars of the string and allocate based on that size.
2905 */
2906 /* Initial allocation is based on the longest-possible unichr
2907 escape.
2908
2909 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2910 unichr, so in this case it's the longest unichr escape. In
2911 narrow (UTF-16) builds this is five chars per source unichr
2912 since there are two unichrs in the surrogate pair, so in narrow
2913 (UTF-16) builds it's not the longest unichr escape.
2914
2915 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2916 so in the narrow (UTF-16) build case it's the longest unichr
2917 escape.
2918 */
2919
Walter Dörwald79e913e2007-05-12 11:08:06 +00002920 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002921#ifdef Py_UNICODE_WIDE
2922 + 10*size
2923#else
2924 + 6*size
2925#endif
2926 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927 if (repr == NULL)
2928 return NULL;
2929
Walter Dörwald79e913e2007-05-12 11:08:06 +00002930 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 while (size-- > 0) {
2933 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002934
Walter Dörwald79e913e2007-05-12 11:08:06 +00002935 /* Escape backslashes */
2936 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937 *p++ = '\\';
2938 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002939 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002940 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002941
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002942#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002943 /* Map 21-bit characters to '\U00xxxxxx' */
2944 else if (ch >= 0x10000) {
2945 *p++ = '\\';
2946 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002947 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2948 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2949 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2950 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2951 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2952 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2953 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2954 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002955 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002956 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002957#else
2958 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002959 else if (ch >= 0xD800 && ch < 0xDC00) {
2960 Py_UNICODE ch2;
2961 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002962
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002963 ch2 = *s++;
2964 size--;
2965 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2966 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2967 *p++ = '\\';
2968 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002969 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2970 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2971 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2972 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2973 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2974 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2975 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2976 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002977 continue;
2978 }
2979 /* Fall through: isolated surrogates are copied as-is */
2980 s--;
2981 size++;
2982 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002983#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002984
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002986 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 *p++ = '\\';
2988 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002989 *p++ = hexdigits[(ch >> 12) & 0x000F];
2990 *p++ = hexdigits[(ch >> 8) & 0x000F];
2991 *p++ = hexdigits[(ch >> 4) & 0x000F];
2992 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002994
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002995 /* Map special whitespace to '\t', \n', '\r' */
2996 else if (ch == '\t') {
2997 *p++ = '\\';
2998 *p++ = 't';
2999 }
3000 else if (ch == '\n') {
3001 *p++ = '\\';
3002 *p++ = 'n';
3003 }
3004 else if (ch == '\r') {
3005 *p++ = '\\';
3006 *p++ = 'r';
3007 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003008
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003009 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003010 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003012 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003013 *p++ = hexdigits[(ch >> 4) & 0x000F];
3014 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003015 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003016
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 /* Copy everything else as-is */
3018 else
3019 *p++ = (char) ch;
3020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021
3022 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003023 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
3024 Py_DECREF(repr);
3025 return NULL;
3026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 return repr;
3028}
3029
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3031{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003032 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 if (!PyUnicode_Check(unicode)) {
3034 PyErr_BadArgument();
3035 return NULL;
3036 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003037 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3038 PyUnicode_GET_SIZE(unicode));
3039
3040 if (!s)
3041 return NULL;
3042 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3043 PyBytes_GET_SIZE(s));
3044 Py_DECREF(s);
3045 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046}
3047
3048/* --- Raw Unicode Escape Codec ------------------------------------------- */
3049
3050PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003051 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 const char *errors)
3053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003055 Py_ssize_t startinpos;
3056 Py_ssize_t endinpos;
3057 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 const char *end;
3061 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 PyObject *errorHandler = NULL;
3063 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003064
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 /* Escaped strings will always be longer than the resulting
3066 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003067 length after conversion to the true value. (But decoding error
3068 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 v = _PyUnicode_New(size);
3070 if (v == NULL)
3071 goto onError;
3072 if (size == 0)
3073 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 end = s + size;
3076 while (s < end) {
3077 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003078 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003080 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081
3082 /* Non-escape characters are interpreted as Unicode ordinals */
3083 if (*s != '\\') {
3084 *p++ = (unsigned char)*s++;
3085 continue;
3086 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003087 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088
3089 /* \u-escapes are only interpreted iff the number of leading
3090 backslashes if odd */
3091 bs = s;
3092 for (;s < end;) {
3093 if (*s != '\\')
3094 break;
3095 *p++ = (unsigned char)*s++;
3096 }
3097 if (((s - bs) & 1) == 0 ||
3098 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003099 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 continue;
3101 }
3102 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003103 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 s++;
3105
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003106 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003108 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003110 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 endinpos = s-starts;
3112 if (unicode_decode_call_errorhandler(
3113 errors, &errorHandler,
3114 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003115 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 }
3120 x = (x<<4) & ~0xF;
3121 if (c >= '0' && c <= '9')
3122 x += c - '0';
3123 else if (c >= 'a' && c <= 'f')
3124 x += 10 + c - 'a';
3125 else
3126 x += 10 + c - 'A';
3127 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003128#ifndef Py_UNICODE_WIDE
3129 if (x > 0x10000) {
3130 if (unicode_decode_call_errorhandler(
3131 errors, &errorHandler,
3132 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003133 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003134 (PyObject **)&v, &outpos, &p))
3135 goto onError;
3136 }
3137#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 *p++ = x;
3139 nextByte:
3140 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003142 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003143 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003144 Py_XDECREF(errorHandler);
3145 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003147
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 onError:
3149 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003150 Py_XDECREF(errorHandler);
3151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 return NULL;
3153}
3154
3155PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003156 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157{
3158 PyObject *repr;
3159 char *p;
3160 char *q;
3161
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003162#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003163 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003164#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003165 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003166#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 if (repr == NULL)
3168 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003169 if (size == 0)
3170 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171
Walter Dörwald711005d2007-05-12 12:03:26 +00003172 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 while (size-- > 0) {
3174 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003175#ifdef Py_UNICODE_WIDE
3176 /* Map 32-bit characters to '\Uxxxxxxxx' */
3177 if (ch >= 0x10000) {
3178 *p++ = '\\';
3179 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003180 *p++ = hexdigits[(ch >> 28) & 0xf];
3181 *p++ = hexdigits[(ch >> 24) & 0xf];
3182 *p++ = hexdigits[(ch >> 20) & 0xf];
3183 *p++ = hexdigits[(ch >> 16) & 0xf];
3184 *p++ = hexdigits[(ch >> 12) & 0xf];
3185 *p++ = hexdigits[(ch >> 8) & 0xf];
3186 *p++ = hexdigits[(ch >> 4) & 0xf];
3187 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003188 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003189 else
3190#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 /* Map 16-bit characters to '\uxxxx' */
3192 if (ch >= 256) {
3193 *p++ = '\\';
3194 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003195 *p++ = hexdigits[(ch >> 12) & 0xf];
3196 *p++ = hexdigits[(ch >> 8) & 0xf];
3197 *p++ = hexdigits[(ch >> 4) & 0xf];
3198 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
3200 /* Copy everything else as-is */
3201 else
3202 *p++ = (char) ch;
3203 }
3204 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003205 if (PyBytes_Resize(repr, p - q)) {
3206 Py_DECREF(repr);
3207 return NULL;
3208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 return repr;
3210}
3211
3212PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3213{
Walter Dörwald711005d2007-05-12 12:03:26 +00003214 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003216 PyErr_BadArgument();
3217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003219 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3220 PyUnicode_GET_SIZE(unicode));
3221
3222 if (!s)
3223 return NULL;
3224 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3225 PyBytes_GET_SIZE(s));
3226 Py_DECREF(s);
3227 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228}
3229
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003230/* --- Unicode Internal Codec ------------------------------------------- */
3231
3232PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003233 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003234 const char *errors)
3235{
3236 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003237 Py_ssize_t startinpos;
3238 Py_ssize_t endinpos;
3239 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003240 PyUnicodeObject *v;
3241 Py_UNICODE *p;
3242 const char *end;
3243 const char *reason;
3244 PyObject *errorHandler = NULL;
3245 PyObject *exc = NULL;
3246
Neal Norwitzd43069c2006-01-08 01:12:10 +00003247#ifdef Py_UNICODE_WIDE
3248 Py_UNICODE unimax = PyUnicode_GetMax();
3249#endif
3250
Thomas Wouters89f507f2006-12-13 04:49:30 +00003251 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003252 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3253 if (v == NULL)
3254 goto onError;
3255 if (PyUnicode_GetSize((PyObject *)v) == 0)
3256 return (PyObject *)v;
3257 p = PyUnicode_AS_UNICODE(v);
3258 end = s + size;
3259
3260 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003261 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003262 /* We have to sanity check the raw data, otherwise doom looms for
3263 some malformed UCS-4 data. */
3264 if (
3265 #ifdef Py_UNICODE_WIDE
3266 *p > unimax || *p < 0 ||
3267 #endif
3268 end-s < Py_UNICODE_SIZE
3269 )
3270 {
3271 startinpos = s - starts;
3272 if (end-s < Py_UNICODE_SIZE) {
3273 endinpos = end-starts;
3274 reason = "truncated input";
3275 }
3276 else {
3277 endinpos = s - starts + Py_UNICODE_SIZE;
3278 reason = "illegal code point (> 0x10FFFF)";
3279 }
3280 outpos = p - PyUnicode_AS_UNICODE(v);
3281 if (unicode_decode_call_errorhandler(
3282 errors, &errorHandler,
3283 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003284 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003285 (PyObject **)&v, &outpos, &p)) {
3286 goto onError;
3287 }
3288 }
3289 else {
3290 p++;
3291 s += Py_UNICODE_SIZE;
3292 }
3293 }
3294
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003295 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003296 goto onError;
3297 Py_XDECREF(errorHandler);
3298 Py_XDECREF(exc);
3299 return (PyObject *)v;
3300
3301 onError:
3302 Py_XDECREF(v);
3303 Py_XDECREF(errorHandler);
3304 Py_XDECREF(exc);
3305 return NULL;
3306}
3307
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308/* --- Latin-1 Codec ------------------------------------------------------ */
3309
3310PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003311 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 const char *errors)
3313{
3314 PyUnicodeObject *v;
3315 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003316
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003318 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003319 Py_UNICODE r = *(unsigned char*)s;
3320 return PyUnicode_FromUnicode(&r, 1);
3321 }
3322
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 v = _PyUnicode_New(size);
3324 if (v == NULL)
3325 goto onError;
3326 if (size == 0)
3327 return (PyObject *)v;
3328 p = PyUnicode_AS_UNICODE(v);
3329 while (size-- > 0)
3330 *p++ = (unsigned char)*s++;
3331 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003332
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 onError:
3334 Py_XDECREF(v);
3335 return NULL;
3336}
3337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338/* create or adjust a UnicodeEncodeError */
3339static void make_encode_exception(PyObject **exceptionObject,
3340 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003341 const Py_UNICODE *unicode, Py_ssize_t size,
3342 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 if (*exceptionObject == NULL) {
3346 *exceptionObject = PyUnicodeEncodeError_Create(
3347 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 }
3349 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3351 goto onError;
3352 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3353 goto onError;
3354 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3355 goto onError;
3356 return;
3357 onError:
3358 Py_DECREF(*exceptionObject);
3359 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 }
3361}
3362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363/* raises a UnicodeEncodeError */
3364static void raise_encode_exception(PyObject **exceptionObject,
3365 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003366 const Py_UNICODE *unicode, Py_ssize_t size,
3367 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003368 const char *reason)
3369{
3370 make_encode_exception(exceptionObject,
3371 encoding, unicode, size, startpos, endpos, reason);
3372 if (*exceptionObject != NULL)
3373 PyCodec_StrictErrors(*exceptionObject);
3374}
3375
3376/* error handling callback helper:
3377 build arguments, call the callback and check the arguments,
3378 put the result into newpos and return the replacement string, which
3379 has to be freed by the caller */
3380static PyObject *unicode_encode_call_errorhandler(const char *errors,
3381 PyObject **errorHandler,
3382 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003383 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3384 Py_ssize_t startpos, Py_ssize_t endpos,
3385 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003387 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388
3389 PyObject *restuple;
3390 PyObject *resunicode;
3391
3392 if (*errorHandler == NULL) {
3393 *errorHandler = PyCodec_LookupError(errors);
3394 if (*errorHandler == NULL)
3395 return NULL;
3396 }
3397
3398 make_encode_exception(exceptionObject,
3399 encoding, unicode, size, startpos, endpos, reason);
3400 if (*exceptionObject == NULL)
3401 return NULL;
3402
3403 restuple = PyObject_CallFunctionObjArgs(
3404 *errorHandler, *exceptionObject, NULL);
3405 if (restuple == NULL)
3406 return NULL;
3407 if (!PyTuple_Check(restuple)) {
3408 PyErr_Format(PyExc_TypeError, &argparse[4]);
3409 Py_DECREF(restuple);
3410 return NULL;
3411 }
3412 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3413 &resunicode, newpos)) {
3414 Py_DECREF(restuple);
3415 return NULL;
3416 }
3417 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003418 *newpos = size+*newpos;
3419 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003420 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003421 Py_DECREF(restuple);
3422 return NULL;
3423 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 Py_INCREF(resunicode);
3425 Py_DECREF(restuple);
3426 return resunicode;
3427}
3428
3429static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003430 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 const char *errors,
3432 int limit)
3433{
3434 /* output object */
3435 PyObject *res;
3436 /* pointers to the beginning and end+1 of input */
3437 const Py_UNICODE *startp = p;
3438 const Py_UNICODE *endp = p + size;
3439 /* pointer to the beginning of the unencodable characters */
3440 /* const Py_UNICODE *badp = NULL; */
3441 /* pointer into the output */
3442 char *str;
3443 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003444 Py_ssize_t respos = 0;
3445 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003446 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3447 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 PyObject *errorHandler = NULL;
3449 PyObject *exc = NULL;
3450 /* the following variable is used for caching string comparisons
3451 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3452 int known_errorHandler = -1;
3453
3454 /* allocate enough for a simple encoding without
3455 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003456 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 if (res == NULL)
3458 goto onError;
3459 if (size == 0)
3460 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003461 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 ressize = size;
3463
3464 while (p<endp) {
3465 Py_UNICODE c = *p;
3466
3467 /* can we encode this? */
3468 if (c<limit) {
3469 /* no overflow check, because we know that the space is enough */
3470 *str++ = (char)c;
3471 ++p;
3472 }
3473 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003474 Py_ssize_t unicodepos = p-startp;
3475 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003477 Py_ssize_t repsize;
3478 Py_ssize_t newpos;
3479 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 Py_UNICODE *uni2;
3481 /* startpos for collecting unencodable chars */
3482 const Py_UNICODE *collstart = p;
3483 const Py_UNICODE *collend = p;
3484 /* find all unecodable characters */
3485 while ((collend < endp) && ((*collend)>=limit))
3486 ++collend;
3487 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3488 if (known_errorHandler==-1) {
3489 if ((errors==NULL) || (!strcmp(errors, "strict")))
3490 known_errorHandler = 1;
3491 else if (!strcmp(errors, "replace"))
3492 known_errorHandler = 2;
3493 else if (!strcmp(errors, "ignore"))
3494 known_errorHandler = 3;
3495 else if (!strcmp(errors, "xmlcharrefreplace"))
3496 known_errorHandler = 4;
3497 else
3498 known_errorHandler = 0;
3499 }
3500 switch (known_errorHandler) {
3501 case 1: /* strict */
3502 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3503 goto onError;
3504 case 2: /* replace */
3505 while (collstart++<collend)
3506 *str++ = '?'; /* fall through */
3507 case 3: /* ignore */
3508 p = collend;
3509 break;
3510 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003511 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 /* determine replacement size (temporarily (mis)uses p) */
3513 for (p = collstart, repsize = 0; p < collend; ++p) {
3514 if (*p<10)
3515 repsize += 2+1+1;
3516 else if (*p<100)
3517 repsize += 2+2+1;
3518 else if (*p<1000)
3519 repsize += 2+3+1;
3520 else if (*p<10000)
3521 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003522#ifndef Py_UNICODE_WIDE
3523 else
3524 repsize += 2+5+1;
3525#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 else if (*p<100000)
3527 repsize += 2+5+1;
3528 else if (*p<1000000)
3529 repsize += 2+6+1;
3530 else
3531 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003532#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 }
3534 requiredsize = respos+repsize+(endp-collend);
3535 if (requiredsize > ressize) {
3536 if (requiredsize<2*ressize)
3537 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003538 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003540 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 ressize = requiredsize;
3542 }
3543 /* generate replacement (temporarily (mis)uses p) */
3544 for (p = collstart; p < collend; ++p) {
3545 str += sprintf(str, "&#%d;", (int)*p);
3546 }
3547 p = collend;
3548 break;
3549 default:
3550 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3551 encoding, reason, startp, size, &exc,
3552 collstart-startp, collend-startp, &newpos);
3553 if (repunicode == NULL)
3554 goto onError;
3555 /* need more space? (at least enough for what we
3556 have+the replacement+the rest of the string, so
3557 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003558 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 repsize = PyUnicode_GET_SIZE(repunicode);
3560 requiredsize = respos+repsize+(endp-collend);
3561 if (requiredsize > ressize) {
3562 if (requiredsize<2*ressize)
3563 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003564 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_DECREF(repunicode);
3566 goto onError;
3567 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003568 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 ressize = requiredsize;
3570 }
3571 /* check if there is anything unencodable in the replacement
3572 and copy it to the output */
3573 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3574 c = *uni2;
3575 if (c >= limit) {
3576 raise_encode_exception(&exc, encoding, startp, size,
3577 unicodepos, unicodepos+1, reason);
3578 Py_DECREF(repunicode);
3579 goto onError;
3580 }
3581 *str = (char)c;
3582 }
3583 p = startp + newpos;
3584 Py_DECREF(repunicode);
3585 }
3586 }
3587 }
3588 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003589 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 if (respos<ressize)
3591 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003592 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 Py_XDECREF(errorHandler);
3594 Py_XDECREF(exc);
3595 return res;
3596
3597 onError:
3598 Py_XDECREF(res);
3599 Py_XDECREF(errorHandler);
3600 Py_XDECREF(exc);
3601 return NULL;
3602}
3603
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003605 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606 const char *errors)
3607{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609}
3610
3611PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3612{
3613 if (!PyUnicode_Check(unicode)) {
3614 PyErr_BadArgument();
3615 return NULL;
3616 }
3617 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3618 PyUnicode_GET_SIZE(unicode),
3619 NULL);
3620}
3621
3622/* --- 7-bit ASCII Codec -------------------------------------------------- */
3623
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003625 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 const char *errors)
3627{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 PyUnicodeObject *v;
3630 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003631 Py_ssize_t startinpos;
3632 Py_ssize_t endinpos;
3633 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 const char *e;
3635 PyObject *errorHandler = NULL;
3636 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003637
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003639 if (size == 1 && *(unsigned char*)s < 128) {
3640 Py_UNICODE r = *(unsigned char*)s;
3641 return PyUnicode_FromUnicode(&r, 1);
3642 }
Tim Petersced69f82003-09-16 20:30:58 +00003643
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 v = _PyUnicode_New(size);
3645 if (v == NULL)
3646 goto onError;
3647 if (size == 0)
3648 return (PyObject *)v;
3649 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 e = s + size;
3651 while (s < e) {
3652 register unsigned char c = (unsigned char)*s;
3653 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 ++s;
3656 }
3657 else {
3658 startinpos = s-starts;
3659 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003660 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 if (unicode_decode_call_errorhandler(
3662 errors, &errorHandler,
3663 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003669 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003670 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003671 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 Py_XDECREF(errorHandler);
3673 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003675
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 onError:
3677 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 Py_XDECREF(errorHandler);
3679 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 return NULL;
3681}
3682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003684 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 const char *errors)
3686{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688}
3689
3690PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3691{
3692 if (!PyUnicode_Check(unicode)) {
3693 PyErr_BadArgument();
3694 return NULL;
3695 }
3696 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3697 PyUnicode_GET_SIZE(unicode),
3698 NULL);
3699}
3700
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003701#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003702
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003703/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003704
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003705#if SIZEOF_INT < SIZEOF_SSIZE_T
3706#define NEED_RETRY
3707#endif
3708
3709/* XXX This code is limited to "true" double-byte encodings, as
3710 a) it assumes an incomplete character consists of a single byte, and
3711 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3712 encodings, see IsDBCSLeadByteEx documentation. */
3713
/* Return nonzero if the byte at s[offset] should be treated as a DBCS lead
   byte.  IsDBCSLeadByte() alone is not enough, so the byte is also checked
   against the character that CharPrev() reports as preceding it. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3724
3725/*
3726 * Decode MBCS string into unicode object. If 'final' is set, converts
3727 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3728 */
3729static int decode_mbcs(PyUnicodeObject **v,
3730 const char *s, /* MBCS string */
3731 int size, /* sizeof MBCS string */
3732 int final)
3733{
3734 Py_UNICODE *p;
3735 Py_ssize_t n = 0;
3736 int usize = 0;
3737
3738 assert(size >= 0);
3739
3740 /* Skip trailing lead-byte unless 'final' is set */
3741 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3742 --size;
3743
3744 /* First get the size of the result */
3745 if (size > 0) {
3746 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3747 if (usize == 0) {
3748 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3749 return -1;
3750 }
3751 }
3752
3753 if (*v == NULL) {
3754 /* Create unicode object */
3755 *v = _PyUnicode_New(usize);
3756 if (*v == NULL)
3757 return -1;
3758 }
3759 else {
3760 /* Extend unicode object */
3761 n = PyUnicode_GET_SIZE(*v);
3762 if (_PyUnicode_Resize(v, n + usize) < 0)
3763 return -1;
3764 }
3765
3766 /* Do the conversion */
3767 if (size > 0) {
3768 p = PyUnicode_AS_UNICODE(*v) + n;
3769 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3770 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3771 return -1;
3772 }
3773 }
3774
3775 return size;
3776}
3777
3778PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3779 Py_ssize_t size,
3780 const char *errors,
3781 Py_ssize_t *consumed)
3782{
3783 PyUnicodeObject *v = NULL;
3784 int done;
3785
3786 if (consumed)
3787 *consumed = 0;
3788
3789#ifdef NEED_RETRY
3790 retry:
3791 if (size > INT_MAX)
3792 done = decode_mbcs(&v, s, INT_MAX, 0);
3793 else
3794#endif
3795 done = decode_mbcs(&v, s, (int)size, !consumed);
3796
3797 if (done < 0) {
3798 Py_XDECREF(v);
3799 return NULL;
3800 }
3801
3802 if (consumed)
3803 *consumed += done;
3804
3805#ifdef NEED_RETRY
3806 if (size > INT_MAX) {
3807 s += done;
3808 size -= done;
3809 goto retry;
3810 }
3811#endif
3812
3813 return (PyObject *)v;
3814}
3815
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003816PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003818 const char *errors)
3819{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003820 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3821}
3822
3823/*
3824 * Convert unicode into string object (MBCS).
3825 * Returns 0 if succeed, -1 otherwise.
3826 */
3827static int encode_mbcs(PyObject **repr,
3828 const Py_UNICODE *p, /* unicode */
3829 int size) /* size of unicode */
3830{
3831 int mbcssize = 0;
3832 Py_ssize_t n = 0;
3833
3834 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003835
3836 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003837 if (size > 0) {
3838 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3839 if (mbcssize == 0) {
3840 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3841 return -1;
3842 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003843 }
3844
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003845 if (*repr == NULL) {
3846 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003847 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003848 if (*repr == NULL)
3849 return -1;
3850 }
3851 else {
3852 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003853 n = PyBytes_Size(*repr);
3854 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003855 return -1;
3856 }
3857
3858 /* Do the conversion */
3859 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003860 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003861 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3862 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3863 return -1;
3864 }
3865 }
3866
3867 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003868}
3869
3870PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003871 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003872 const char *errors)
3873{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003874 PyObject *repr = NULL;
3875 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003876
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003877#ifdef NEED_RETRY
3878 retry:
3879 if (size > INT_MAX)
3880 ret = encode_mbcs(&repr, p, INT_MAX);
3881 else
3882#endif
3883 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003884
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003885 if (ret < 0) {
3886 Py_XDECREF(repr);
3887 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003888 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003889
3890#ifdef NEED_RETRY
3891 if (size > INT_MAX) {
3892 p += INT_MAX;
3893 size -= INT_MAX;
3894 goto retry;
3895 }
3896#endif
3897
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003898 return repr;
3899}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003900
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003901PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3902{
3903 if (!PyUnicode_Check(unicode)) {
3904 PyErr_BadArgument();
3905 return NULL;
3906 }
3907 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3908 PyUnicode_GET_SIZE(unicode),
3909 NULL);
3910}
3911
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003912#undef NEED_RETRY
3913
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003914#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003915
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916/* --- Character Mapping Codec -------------------------------------------- */
3917
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003919 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 PyObject *mapping,
3921 const char *errors)
3922{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003924 Py_ssize_t startinpos;
3925 Py_ssize_t endinpos;
3926 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 PyUnicodeObject *v;
3929 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003930 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 PyObject *errorHandler = NULL;
3932 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003933 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003934 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003935
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 /* Default to Latin-1 */
3937 if (mapping == NULL)
3938 return PyUnicode_DecodeLatin1(s, size, errors);
3939
3940 v = _PyUnicode_New(size);
3941 if (v == NULL)
3942 goto onError;
3943 if (size == 0)
3944 return (PyObject *)v;
3945 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003947 if (PyUnicode_CheckExact(mapping)) {
3948 mapstring = PyUnicode_AS_UNICODE(mapping);
3949 maplen = PyUnicode_GET_SIZE(mapping);
3950 while (s < e) {
3951 unsigned char ch = *s;
3952 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003954 if (ch < maplen)
3955 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003957 if (x == 0xfffe) {
3958 /* undefined mapping */
3959 outpos = p-PyUnicode_AS_UNICODE(v);
3960 startinpos = s-starts;
3961 endinpos = startinpos+1;
3962 if (unicode_decode_call_errorhandler(
3963 errors, &errorHandler,
3964 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003965 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003966 (PyObject **)&v, &outpos, &p)) {
3967 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003968 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003969 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003970 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003971 *p++ = x;
3972 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003974 }
3975 else {
3976 while (s < e) {
3977 unsigned char ch = *s;
3978 PyObject *w, *x;
3979
3980 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3981 w = PyInt_FromLong((long)ch);
3982 if (w == NULL)
3983 goto onError;
3984 x = PyObject_GetItem(mapping, w);
3985 Py_DECREF(w);
3986 if (x == NULL) {
3987 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3988 /* No mapping found means: mapping is undefined. */
3989 PyErr_Clear();
3990 x = Py_None;
3991 Py_INCREF(x);
3992 } else
3993 goto onError;
3994 }
3995
3996 /* Apply mapping */
3997 if (PyInt_Check(x)) {
3998 long value = PyInt_AS_LONG(x);
3999 if (value < 0 || value > 65535) {
4000 PyErr_SetString(PyExc_TypeError,
4001 "character mapping must be in range(65536)");
4002 Py_DECREF(x);
4003 goto onError;
4004 }
4005 *p++ = (Py_UNICODE)value;
4006 }
4007 else if (x == Py_None) {
4008 /* undefined mapping */
4009 outpos = p-PyUnicode_AS_UNICODE(v);
4010 startinpos = s-starts;
4011 endinpos = startinpos+1;
4012 if (unicode_decode_call_errorhandler(
4013 errors, &errorHandler,
4014 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004015 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004016 (PyObject **)&v, &outpos, &p)) {
4017 Py_DECREF(x);
4018 goto onError;
4019 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004020 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004021 continue;
4022 }
4023 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004024 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004025
4026 if (targetsize == 1)
4027 /* 1-1 mapping */
4028 *p++ = *PyUnicode_AS_UNICODE(x);
4029
4030 else if (targetsize > 1) {
4031 /* 1-n mapping */
4032 if (targetsize > extrachars) {
4033 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004034 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4035 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004036 (targetsize << 2);
4037 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004038 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004039 if (_PyUnicode_Resize(&v,
4040 PyUnicode_GET_SIZE(v) + needed) < 0) {
4041 Py_DECREF(x);
4042 goto onError;
4043 }
4044 p = PyUnicode_AS_UNICODE(v) + oldpos;
4045 }
4046 Py_UNICODE_COPY(p,
4047 PyUnicode_AS_UNICODE(x),
4048 targetsize);
4049 p += targetsize;
4050 extrachars -= targetsize;
4051 }
4052 /* 1-0 mapping: skip the character */
4053 }
4054 else {
4055 /* wrong return value */
4056 PyErr_SetString(PyExc_TypeError,
4057 "character mapping must return integer, None or unicode");
4058 Py_DECREF(x);
4059 goto onError;
4060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004062 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 }
4065 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004066 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 Py_XDECREF(errorHandler);
4069 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004071
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 Py_XDECREF(errorHandler);
4074 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 Py_XDECREF(v);
4076 return NULL;
4077}
4078
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004079/* Charmap encoding: the lookup table */
4080
/* Three-level lookup table from BMP code point to byte, built by
   PyUnicode_BuildEncodingMap() and read by encoding_map_lookup(). */
struct encoding_map{
  PyObject_HEAD
  /* indexed by (c >> 11); 0xFF marks "no level-2 block" */
  unsigned char level1[32];
  /* number of 16-entry level-2 blocks and 128-entry level-3 blocks */
  int count2, count3;
  /* variable-length tail: 16*count2 level-2 bytes followed by
     128*count3 level-3 bytes (the object is over-allocated for it) */
  unsigned char level23[1];
};
4087
4088static PyObject*
4089encoding_map_size(PyObject *obj, PyObject* args)
4090{
4091 struct encoding_map *map = (struct encoding_map*)obj;
4092 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4093 128*map->count3);
4094}
4095
/* Method table for EncodingMap objects: a single no-argument "size"
   method, terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
	{"size", encoding_map_size, METH_NOARGS,
	 PyDoc_STR("Return the size (in bytes) of this object") },
	{ 0 }
};
4101
/* tp_dealloc for EncodingMap: the object is a single block obtained from
   PyObject_MALLOC in PyUnicode_BuildEncodingMap(), so releasing that
   block is all that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
	PyObject_FREE(o);
}
4107
/* Type object for the trie built by PyUnicode_BuildEncodingMap().
   Only tp_dealloc and tp_methods are filled in; tp_new is 0, and the
   only place in this section that creates instances is
   PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
	PyVarObject_HEAD_INIT(NULL, 0)
        "EncodingMap",          /*tp_name*/
        sizeof(struct encoding_map),   /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        encoding_map_dealloc,   /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        0,                      /*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        encoding_map_methods,   /*tp_methods*/
        0,                      /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
4151
4152PyObject*
4153PyUnicode_BuildEncodingMap(PyObject* string)
4154{
4155 Py_UNICODE *decode;
4156 PyObject *result;
4157 struct encoding_map *mresult;
4158 int i;
4159 int need_dict = 0;
4160 unsigned char level1[32];
4161 unsigned char level2[512];
4162 unsigned char *mlevel1, *mlevel2, *mlevel3;
4163 int count2 = 0, count3 = 0;
4164
4165 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4166 PyErr_BadArgument();
4167 return NULL;
4168 }
4169 decode = PyUnicode_AS_UNICODE(string);
4170 memset(level1, 0xFF, sizeof level1);
4171 memset(level2, 0xFF, sizeof level2);
4172
4173 /* If there isn't a one-to-one mapping of NULL to \0,
4174 or if there are non-BMP characters, we need to use
4175 a mapping dictionary. */
4176 if (decode[0] != 0)
4177 need_dict = 1;
4178 for (i = 1; i < 256; i++) {
4179 int l1, l2;
4180 if (decode[i] == 0
4181 #ifdef Py_UNICODE_WIDE
4182 || decode[i] > 0xFFFF
4183 #endif
4184 ) {
4185 need_dict = 1;
4186 break;
4187 }
4188 if (decode[i] == 0xFFFE)
4189 /* unmapped character */
4190 continue;
4191 l1 = decode[i] >> 11;
4192 l2 = decode[i] >> 7;
4193 if (level1[l1] == 0xFF)
4194 level1[l1] = count2++;
4195 if (level2[l2] == 0xFF)
4196 level2[l2] = count3++;
4197 }
4198
4199 if (count2 >= 0xFF || count3 >= 0xFF)
4200 need_dict = 1;
4201
4202 if (need_dict) {
4203 PyObject *result = PyDict_New();
4204 PyObject *key, *value;
4205 if (!result)
4206 return NULL;
4207 for (i = 0; i < 256; i++) {
4208 key = value = NULL;
4209 key = PyInt_FromLong(decode[i]);
4210 value = PyInt_FromLong(i);
4211 if (!key || !value)
4212 goto failed1;
4213 if (PyDict_SetItem(result, key, value) == -1)
4214 goto failed1;
4215 Py_DECREF(key);
4216 Py_DECREF(value);
4217 }
4218 return result;
4219 failed1:
4220 Py_XDECREF(key);
4221 Py_XDECREF(value);
4222 Py_DECREF(result);
4223 return NULL;
4224 }
4225
4226 /* Create a three-level trie */
4227 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4228 16*count2 + 128*count3 - 1);
4229 if (!result)
4230 return PyErr_NoMemory();
4231 PyObject_Init(result, &EncodingMapType);
4232 mresult = (struct encoding_map*)result;
4233 mresult->count2 = count2;
4234 mresult->count3 = count3;
4235 mlevel1 = mresult->level1;
4236 mlevel2 = mresult->level23;
4237 mlevel3 = mresult->level23 + 16*count2;
4238 memcpy(mlevel1, level1, 32);
4239 memset(mlevel2, 0xFF, 16*count2);
4240 memset(mlevel3, 0, 128*count3);
4241 count3 = 0;
4242 for (i = 1; i < 256; i++) {
4243 int o1, o2, o3, i2, i3;
4244 if (decode[i] == 0xFFFE)
4245 /* unmapped character */
4246 continue;
4247 o1 = decode[i]>>11;
4248 o2 = (decode[i]>>7) & 0xF;
4249 i2 = 16*mlevel1[o1] + o2;
4250 if (mlevel2[i2] == 0xFF)
4251 mlevel2[i2] = count3++;
4252 o3 = decode[i] & 0x7F;
4253 i3 = 128*mlevel2[i2] + o3;
4254 mlevel3[i3] = i;
4255 }
4256 return result;
4257}
4258
4259static int
4260encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4261{
4262 struct encoding_map *map = (struct encoding_map*)mapping;
4263 int l1 = c>>11;
4264 int l2 = (c>>7) & 0xF;
4265 int l3 = c & 0x7F;
4266 int i;
4267
4268#ifdef Py_UNICODE_WIDE
4269 if (c > 0xFFFF) {
4270 return -1;
4271 }
4272#endif
4273 if (c == 0)
4274 return 0;
4275 /* level 1*/
4276 i = map->level1[l1];
4277 if (i == 0xFF) {
4278 return -1;
4279 }
4280 /* level 2*/
4281 i = map->level23[16*i+l2];
4282 if (i == 0xFF) {
4283 return -1;
4284 }
4285 /* level 3 */
4286 i = map->level23[16*map->count2 + 128*i + l3];
4287 if (i == 0) {
4288 return -1;
4289 }
4290 return i;
4291}
4292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293/* Lookup the character ch in the mapping. If the character
4294 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004295 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 PyObject *w = PyInt_FromLong((long)c);
4299 PyObject *x;
4300
4301 if (w == NULL)
4302 return NULL;
4303 x = PyObject_GetItem(mapping, w);
4304 Py_DECREF(w);
4305 if (x == NULL) {
4306 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4307 /* No mapping found means: mapping is undefined. */
4308 PyErr_Clear();
4309 x = Py_None;
4310 Py_INCREF(x);
4311 return x;
4312 } else
4313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004315 else if (x == Py_None)
4316 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 else if (PyInt_Check(x)) {
4318 long value = PyInt_AS_LONG(x);
4319 if (value < 0 || value > 255) {
4320 PyErr_SetString(PyExc_TypeError,
4321 "character mapping must be in range(256)");
4322 Py_DECREF(x);
4323 return NULL;
4324 }
4325 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 else if (PyString_Check(x))
4328 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004331 PyErr_Format(PyExc_TypeError,
4332 "character mapping must return integer, None or str8, not %.400s",
4333 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 Py_DECREF(x);
4335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 }
4337}
4338
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004339static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004340charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341{
Walter Dörwald827b0552007-05-12 13:23:53 +00004342 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004343 /* exponentially overallocate to minimize reallocations */
4344 if (requiredsize < 2*outsize)
4345 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004346 if (PyBytes_Resize(outobj, requiredsize)) {
4347 Py_DECREF(outobj);
4348 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004349 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004350 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004351}
4352
/* Outcome of charmapencode_output():
   enc_SUCCESS   - the character was encoded and written to the output,
   enc_FAILED    - the mapping is undefined for the character,
   enc_EXCEPTION - a Python exception is pending (lookup or resize error). */
typedef enum charmapencode_result {
  enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004357 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 space is available. Return a new reference to the object that
4359 was put in the output buffer, or Py_None, if the mapping was undefined
4360 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004361 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004363charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004364 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004365{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004366 PyObject *rep;
4367 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004368 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004370 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004371 int res = encoding_map_lookup(c, mapping);
4372 Py_ssize_t requiredsize = *outpos+1;
4373 if (res == -1)
4374 return enc_FAILED;
4375 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004376 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004377 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004378 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004379 outstart[(*outpos)++] = (char)res;
4380 return enc_SUCCESS;
4381 }
4382
4383 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004385 return enc_EXCEPTION;
4386 else if (rep==Py_None) {
4387 Py_DECREF(rep);
4388 return enc_FAILED;
4389 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004392 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004393 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004395 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004397 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4399 }
4400 else {
4401 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004402 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4403 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004404 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004405 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004407 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004409 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 memcpy(outstart + *outpos, repchars, repsize);
4411 *outpos += repsize;
4412 }
4413 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004414 Py_DECREF(rep);
4415 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416}
4417
4418/* handle an error in PyUnicode_EncodeCharmap
4419 Return 0 on success, -1 on error */
4420static
4421int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004422 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004424 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004425 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426{
4427 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004428 Py_ssize_t repsize;
4429 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 Py_UNICODE *uni2;
4431 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t collstartpos = *inpos;
4433 Py_ssize_t collendpos = *inpos+1;
4434 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 char *encoding = "charmap";
4436 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004437 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 /* find all unencodable characters */
4440 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004441 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004442 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004443 int res = encoding_map_lookup(p[collendpos], mapping);
4444 if (res != -1)
4445 break;
4446 ++collendpos;
4447 continue;
4448 }
4449
4450 rep = charmapencode_lookup(p[collendpos], mapping);
4451 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004453 else if (rep!=Py_None) {
4454 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 break;
4456 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004457 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 ++collendpos;
4459 }
4460 /* cache callback name lookup
4461 * (if not done yet, i.e. it's the first error) */
4462 if (*known_errorHandler==-1) {
4463 if ((errors==NULL) || (!strcmp(errors, "strict")))
4464 *known_errorHandler = 1;
4465 else if (!strcmp(errors, "replace"))
4466 *known_errorHandler = 2;
4467 else if (!strcmp(errors, "ignore"))
4468 *known_errorHandler = 3;
4469 else if (!strcmp(errors, "xmlcharrefreplace"))
4470 *known_errorHandler = 4;
4471 else
4472 *known_errorHandler = 0;
4473 }
4474 switch (*known_errorHandler) {
4475 case 1: /* strict */
4476 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4477 return -1;
4478 case 2: /* replace */
4479 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4480 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004481 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 return -1;
4483 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004484 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4486 return -1;
4487 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 }
4489 /* fall through */
4490 case 3: /* ignore */
4491 *inpos = collendpos;
4492 break;
4493 case 4: /* xmlcharrefreplace */
4494 /* generate replacement (temporarily (mis)uses p) */
4495 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4496 char buffer[2+29+1+1];
4497 char *cp;
4498 sprintf(buffer, "&#%d;", (int)p[collpos]);
4499 for (cp = buffer; *cp; ++cp) {
4500 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004501 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004503 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4505 return -1;
4506 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 }
4508 }
4509 *inpos = collendpos;
4510 break;
4511 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004512 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 encoding, reason, p, size, exceptionObject,
4514 collstartpos, collendpos, &newpos);
4515 if (repunicode == NULL)
4516 return -1;
4517 /* generate replacement */
4518 repsize = PyUnicode_GET_SIZE(repunicode);
4519 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4520 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004521 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 return -1;
4523 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004524 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4527 return -1;
4528 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 }
4530 *inpos = newpos;
4531 Py_DECREF(repunicode);
4532 }
4533 return 0;
4534}
4535
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004537 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 PyObject *mapping,
4539 const char *errors)
4540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 /* output object */
4542 PyObject *res = NULL;
4543 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004544 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004546 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 PyObject *errorHandler = NULL;
4548 PyObject *exc = NULL;
4549 /* the following variable is used for caching string comparisons
4550 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4551 * 3=ignore, 4=xmlcharrefreplace */
4552 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553
4554 /* Default to Latin-1 */
4555 if (mapping == NULL)
4556 return PyUnicode_EncodeLatin1(p, size, errors);
4557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 /* allocate enough for a simple encoding without
4559 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004560 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 if (res == NULL)
4562 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004563 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 while (inpos<size) {
4567 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004568 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004569 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004571 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (charmap_encoding_error(p, size, &inpos, mapping,
4573 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004574 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004575 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004576 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 else
4580 /* done with this character => adjust input position */
4581 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004585 if (respos<PyBytes_GET_SIZE(res)) {
4586 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 goto onError;
4588 }
4589 Py_XDECREF(exc);
4590 Py_XDECREF(errorHandler);
4591 return res;
4592
4593 onError:
4594 Py_XDECREF(res);
4595 Py_XDECREF(exc);
4596 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 return NULL;
4598}
4599
4600PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4601 PyObject *mapping)
4602{
4603 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4604 PyErr_BadArgument();
4605 return NULL;
4606 }
4607 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4608 PyUnicode_GET_SIZE(unicode),
4609 mapping,
4610 NULL);
4611}
4612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613/* create or adjust a UnicodeTranslateError */
4614static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004615 const Py_UNICODE *unicode, Py_ssize_t size,
4616 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 if (*exceptionObject == NULL) {
4620 *exceptionObject = PyUnicodeTranslateError_Create(
4621 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 }
4623 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4625 goto onError;
4626 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4627 goto onError;
4628 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4629 goto onError;
4630 return;
4631 onError:
4632 Py_DECREF(*exceptionObject);
4633 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 }
4635}
4636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637/* raises a UnicodeTranslateError */
4638static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004639 const Py_UNICODE *unicode, Py_ssize_t size,
4640 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 const char *reason)
4642{
4643 make_translate_exception(exceptionObject,
4644 unicode, size, startpos, endpos, reason);
4645 if (*exceptionObject != NULL)
4646 PyCodec_StrictErrors(*exceptionObject);
4647}
4648
4649/* error handling callback helper:
4650 build arguments, call the callback and check the arguments,
4651 put the result into newpos and return the replacement string, which
4652 has to be freed by the caller */
4653static PyObject *unicode_translate_call_errorhandler(const char *errors,
4654 PyObject **errorHandler,
4655 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004656 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4657 Py_ssize_t startpos, Py_ssize_t endpos,
4658 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004660 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004662 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 PyObject *restuple;
4664 PyObject *resunicode;
4665
4666 if (*errorHandler == NULL) {
4667 *errorHandler = PyCodec_LookupError(errors);
4668 if (*errorHandler == NULL)
4669 return NULL;
4670 }
4671
4672 make_translate_exception(exceptionObject,
4673 unicode, size, startpos, endpos, reason);
4674 if (*exceptionObject == NULL)
4675 return NULL;
4676
4677 restuple = PyObject_CallFunctionObjArgs(
4678 *errorHandler, *exceptionObject, NULL);
4679 if (restuple == NULL)
4680 return NULL;
4681 if (!PyTuple_Check(restuple)) {
4682 PyErr_Format(PyExc_TypeError, &argparse[4]);
4683 Py_DECREF(restuple);
4684 return NULL;
4685 }
4686 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 Py_DECREF(restuple);
4689 return NULL;
4690 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004691 if (i_newpos<0)
4692 *newpos = size+i_newpos;
4693 else
4694 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004695 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004696 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004697 Py_DECREF(restuple);
4698 return NULL;
4699 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 Py_INCREF(resunicode);
4701 Py_DECREF(restuple);
4702 return resunicode;
4703}
4704
4705/* Lookup the character ch in the mapping and put the result in result,
4706 which must be decrefed by the caller.
4707 Return 0 on success, -1 on error */
4708static
4709int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4710{
4711 PyObject *w = PyInt_FromLong((long)c);
4712 PyObject *x;
4713
4714 if (w == NULL)
4715 return -1;
4716 x = PyObject_GetItem(mapping, w);
4717 Py_DECREF(w);
4718 if (x == NULL) {
4719 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4720 /* No mapping found means: use 1:1 mapping. */
4721 PyErr_Clear();
4722 *result = NULL;
4723 return 0;
4724 } else
4725 return -1;
4726 }
4727 else if (x == Py_None) {
4728 *result = x;
4729 return 0;
4730 }
4731 else if (PyInt_Check(x)) {
4732 long value = PyInt_AS_LONG(x);
4733 long max = PyUnicode_GetMax();
4734 if (value < 0 || value > max) {
4735 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004736 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 Py_DECREF(x);
4738 return -1;
4739 }
4740 *result = x;
4741 return 0;
4742 }
4743 else if (PyUnicode_Check(x)) {
4744 *result = x;
4745 return 0;
4746 }
4747 else {
4748 /* wrong return value */
4749 PyErr_SetString(PyExc_TypeError,
4750 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004751 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 return -1;
4753 }
4754}
4755/* ensure that *outobj is at least requiredsize characters long,
4756if not reallocate and adjust various state variables.
4757Return 0 on success, -1 on error */
4758static
Walter Dörwald4894c302003-10-24 14:25:28 +00004759int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004760 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004762 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004763 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004764 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004765 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004767 if (requiredsize < 2 * oldsize)
4768 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004769 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 return -1;
4771 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 }
4773 return 0;
4774}
4775/* lookup the character, put the result in the output string and adjust
4776 various state variables. Return a new reference to the object that
4777 was put in the output buffer in *result, or Py_None, if the mapping was
4778 undefined (in which case no character was written).
4779 The called must decref result.
4780 Return 0 on success, -1 on error. */
4781static
Walter Dörwald4894c302003-10-24 14:25:28 +00004782int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004783 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004784 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785{
Walter Dörwald4894c302003-10-24 14:25:28 +00004786 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 return -1;
4788 if (*res==NULL) {
4789 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004790 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 }
4792 else if (*res==Py_None)
4793 ;
4794 else if (PyInt_Check(*res)) {
4795 /* no overflow check, because we know that the space is enough */
4796 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4797 }
4798 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004799 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800 if (repsize==1) {
4801 /* no overflow check, because we know that the space is enough */
4802 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4803 }
4804 else if (repsize!=0) {
4805 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004807 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004808 repsize - 1;
4809 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810 return -1;
4811 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4812 *outp += repsize;
4813 }
4814 }
4815 else
4816 return -1;
4817 return 0;
4818}
4819
4820PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004821 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 PyObject *mapping,
4823 const char *errors)
4824{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 /* output object */
4826 PyObject *res = NULL;
4827 /* pointers to the beginning and end+1 of input */
4828 const Py_UNICODE *startp = p;
4829 const Py_UNICODE *endp = p + size;
4830 /* pointer into the output */
4831 Py_UNICODE *str;
4832 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004833 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 char *reason = "character maps to <undefined>";
4835 PyObject *errorHandler = NULL;
4836 PyObject *exc = NULL;
4837 /* the following variable is used for caching string comparisons
4838 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4839 * 3=ignore, 4=xmlcharrefreplace */
4840 int known_errorHandler = -1;
4841
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 if (mapping == NULL) {
4843 PyErr_BadArgument();
4844 return NULL;
4845 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846
4847 /* allocate enough for a simple 1:1 translation without
4848 replacements, if we need more, we'll resize */
4849 res = PyUnicode_FromUnicode(NULL, size);
4850 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004851 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 return res;
4854 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 while (p<endp) {
4857 /* try to encode it */
4858 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004859 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 goto onError;
4862 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004863 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 if (x!=Py_None) /* it worked => adjust input pointer */
4865 ++p;
4866 else { /* untranslatable character */
4867 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004868 Py_ssize_t repsize;
4869 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 Py_UNICODE *uni2;
4871 /* startpos for collecting untranslatable chars */
4872 const Py_UNICODE *collstart = p;
4873 const Py_UNICODE *collend = p+1;
4874 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 /* find all untranslatable characters */
4877 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004878 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 goto onError;
4880 Py_XDECREF(x);
4881 if (x!=Py_None)
4882 break;
4883 ++collend;
4884 }
4885 /* cache callback name lookup
4886 * (if not done yet, i.e. it's the first error) */
4887 if (known_errorHandler==-1) {
4888 if ((errors==NULL) || (!strcmp(errors, "strict")))
4889 known_errorHandler = 1;
4890 else if (!strcmp(errors, "replace"))
4891 known_errorHandler = 2;
4892 else if (!strcmp(errors, "ignore"))
4893 known_errorHandler = 3;
4894 else if (!strcmp(errors, "xmlcharrefreplace"))
4895 known_errorHandler = 4;
4896 else
4897 known_errorHandler = 0;
4898 }
4899 switch (known_errorHandler) {
4900 case 1: /* strict */
4901 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4902 goto onError;
4903 case 2: /* replace */
4904 /* No need to check for space, this is a 1:1 replacement */
4905 for (coll = collstart; coll<collend; ++coll)
4906 *str++ = '?';
4907 /* fall through */
4908 case 3: /* ignore */
4909 p = collend;
4910 break;
4911 case 4: /* xmlcharrefreplace */
4912 /* generate replacement (temporarily (mis)uses p) */
4913 for (p = collstart; p < collend; ++p) {
4914 char buffer[2+29+1+1];
4915 char *cp;
4916 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004917 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4919 goto onError;
4920 for (cp = buffer; *cp; ++cp)
4921 *str++ = *cp;
4922 }
4923 p = collend;
4924 break;
4925 default:
4926 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4927 reason, startp, size, &exc,
4928 collstart-startp, collend-startp, &newpos);
4929 if (repunicode == NULL)
4930 goto onError;
4931 /* generate replacement */
4932 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004933 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4935 Py_DECREF(repunicode);
4936 goto onError;
4937 }
4938 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4939 *str++ = *uni2;
4940 p = startp + newpos;
4941 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942 }
4943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945 /* Resize if we allocated to much */
4946 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004947 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004948 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004949 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004950 }
4951 Py_XDECREF(exc);
4952 Py_XDECREF(errorHandler);
4953 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 onError:
4956 Py_XDECREF(res);
4957 Py_XDECREF(exc);
4958 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 return NULL;
4960}
4961
4962PyObject *PyUnicode_Translate(PyObject *str,
4963 PyObject *mapping,
4964 const char *errors)
4965{
4966 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004967
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 str = PyUnicode_FromObject(str);
4969 if (str == NULL)
4970 goto onError;
4971 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4972 PyUnicode_GET_SIZE(str),
4973 mapping,
4974 errors);
4975 Py_DECREF(str);
4976 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004977
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 onError:
4979 Py_XDECREF(str);
4980 return NULL;
4981}
Tim Petersced69f82003-09-16 20:30:58 +00004982
Guido van Rossum9e896b32000-04-05 20:11:21 +00004983/* --- Decimal Encoder ---------------------------------------------------- */
4984
4985int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004986 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004987 char *output,
4988 const char *errors)
4989{
4990 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 PyObject *errorHandler = NULL;
4992 PyObject *exc = NULL;
4993 const char *encoding = "decimal";
4994 const char *reason = "invalid decimal Unicode string";
4995 /* the following variable is used for caching string comparisons
4996 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4997 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998
4999 if (output == NULL) {
5000 PyErr_BadArgument();
5001 return -1;
5002 }
5003
5004 p = s;
5005 end = s + length;
5006 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005008 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005009 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005010 Py_ssize_t repsize;
5011 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005012 Py_UNICODE *uni2;
5013 Py_UNICODE *collstart;
5014 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005015
Guido van Rossum9e896b32000-04-05 20:11:21 +00005016 if (Py_UNICODE_ISSPACE(ch)) {
5017 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005018 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005019 continue;
5020 }
5021 decimal = Py_UNICODE_TODECIMAL(ch);
5022 if (decimal >= 0) {
5023 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005025 continue;
5026 }
Guido van Rossumba477042000-04-06 18:18:10 +00005027 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005028 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005030 continue;
5031 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005032 /* All other characters are considered unencodable */
5033 collstart = p;
5034 collend = p+1;
5035 while (collend < end) {
5036 if ((0 < *collend && *collend < 256) ||
5037 !Py_UNICODE_ISSPACE(*collend) ||
5038 Py_UNICODE_TODECIMAL(*collend))
5039 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 /* cache callback name lookup
5042 * (if not done yet, i.e. it's the first error) */
5043 if (known_errorHandler==-1) {
5044 if ((errors==NULL) || (!strcmp(errors, "strict")))
5045 known_errorHandler = 1;
5046 else if (!strcmp(errors, "replace"))
5047 known_errorHandler = 2;
5048 else if (!strcmp(errors, "ignore"))
5049 known_errorHandler = 3;
5050 else if (!strcmp(errors, "xmlcharrefreplace"))
5051 known_errorHandler = 4;
5052 else
5053 known_errorHandler = 0;
5054 }
5055 switch (known_errorHandler) {
5056 case 1: /* strict */
5057 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5058 goto onError;
5059 case 2: /* replace */
5060 for (p = collstart; p < collend; ++p)
5061 *output++ = '?';
5062 /* fall through */
5063 case 3: /* ignore */
5064 p = collend;
5065 break;
5066 case 4: /* xmlcharrefreplace */
5067 /* generate replacement (temporarily (mis)uses p) */
5068 for (p = collstart; p < collend; ++p)
5069 output += sprintf(output, "&#%d;", (int)*p);
5070 p = collend;
5071 break;
5072 default:
5073 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5074 encoding, reason, s, length, &exc,
5075 collstart-s, collend-s, &newpos);
5076 if (repunicode == NULL)
5077 goto onError;
5078 /* generate replacement */
5079 repsize = PyUnicode_GET_SIZE(repunicode);
5080 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5081 Py_UNICODE ch = *uni2;
5082 if (Py_UNICODE_ISSPACE(ch))
5083 *output++ = ' ';
5084 else {
5085 decimal = Py_UNICODE_TODECIMAL(ch);
5086 if (decimal >= 0)
5087 *output++ = '0' + decimal;
5088 else if (0 < ch && ch < 256)
5089 *output++ = (char)ch;
5090 else {
5091 Py_DECREF(repunicode);
5092 raise_encode_exception(&exc, encoding,
5093 s, length, collstart-s, collend-s, reason);
5094 goto onError;
5095 }
5096 }
5097 }
5098 p = s + newpos;
5099 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005100 }
5101 }
5102 /* 0-terminate the output string */
5103 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104 Py_XDECREF(exc);
5105 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005106 return 0;
5107
5108 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109 Py_XDECREF(exc);
5110 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005111 return -1;
5112}
5113
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114/* --- Helpers ------------------------------------------------------------ */
5115
Eric Smith8c663262007-08-25 02:26:07 +00005116#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005117
5118#include "stringlib/fastsearch.h"
5119
5120#include "stringlib/count.h"
5121#include "stringlib/find.h"
5122#include "stringlib/partition.h"
5123
/* helper macro to fixup start/end slice values.
   Operates on the variables `start` and `end` of the enclosing scope:
   a negative start has (obj)->length added and is then clamped to 0; an
   end beyond (obj)->length is cut back to it, a negative end has
   (obj)->length added and is then clamped to 0.
   Wrapped in do { } while (0) so that it expands to a single statement
   and is safe in unbraced if/else bodies; use it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5136
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005138 PyObject *substr,
5139 Py_ssize_t start,
5140 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005142 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005143 PyUnicodeObject* str_obj;
5144 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005145
Thomas Wouters477c8d52006-05-27 19:21:47 +00005146 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5147 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005149 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5150 if (!sub_obj) {
5151 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 return -1;
5153 }
Tim Petersced69f82003-09-16 20:30:58 +00005154
Thomas Wouters477c8d52006-05-27 19:21:47 +00005155 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005156
Thomas Wouters477c8d52006-05-27 19:21:47 +00005157 result = stringlib_count(
5158 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5159 );
5160
5161 Py_DECREF(sub_obj);
5162 Py_DECREF(str_obj);
5163
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 return result;
5165}
5166
Martin v. Löwis18e16552006-02-15 17:27:45 +00005167Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005168 PyObject *sub,
5169 Py_ssize_t start,
5170 Py_ssize_t end,
5171 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005173 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005174
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005176 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005177 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005178 sub = PyUnicode_FromObject(sub);
5179 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005180 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005181 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 }
Tim Petersced69f82003-09-16 20:30:58 +00005183
Thomas Wouters477c8d52006-05-27 19:21:47 +00005184 if (direction > 0)
5185 result = stringlib_find_slice(
5186 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5187 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5188 start, end
5189 );
5190 else
5191 result = stringlib_rfind_slice(
5192 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5193 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5194 start, end
5195 );
5196
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005198 Py_DECREF(sub);
5199
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 return result;
5201}
5202
Tim Petersced69f82003-09-16 20:30:58 +00005203static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204int tailmatch(PyUnicodeObject *self,
5205 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t start,
5207 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 int direction)
5209{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 if (substring->length == 0)
5211 return 1;
5212
Thomas Wouters477c8d52006-05-27 19:21:47 +00005213 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214
5215 end -= substring->length;
5216 if (end < start)
5217 return 0;
5218
5219 if (direction > 0) {
5220 if (Py_UNICODE_MATCH(self, end, substring))
5221 return 1;
5222 } else {
5223 if (Py_UNICODE_MATCH(self, start, substring))
5224 return 1;
5225 }
5226
5227 return 0;
5228}
5229
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 Py_ssize_t start,
5233 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 int direction)
5235{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005236 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005237
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 str = PyUnicode_FromObject(str);
5239 if (str == NULL)
5240 return -1;
5241 substr = PyUnicode_FromObject(substr);
5242 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005243 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 return -1;
5245 }
Tim Petersced69f82003-09-16 20:30:58 +00005246
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 result = tailmatch((PyUnicodeObject *)str,
5248 (PyUnicodeObject *)substr,
5249 start, end, direction);
5250 Py_DECREF(str);
5251 Py_DECREF(substr);
5252 return result;
5253}
5254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255/* Apply fixfct filter to the Unicode object self and return a
5256 reference to the modified object */
5257
Tim Petersced69f82003-09-16 20:30:58 +00005258static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259PyObject *fixup(PyUnicodeObject *self,
5260 int (*fixfct)(PyUnicodeObject *s))
5261{
5262
5263 PyUnicodeObject *u;
5264
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005265 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 if (u == NULL)
5267 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005268
5269 Py_UNICODE_COPY(u->str, self->str, self->length);
5270
Tim Peters7a29bd52001-09-12 03:03:31 +00005271 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 /* fixfct should return TRUE if it modified the buffer. If
5273 FALSE, return a reference to the original buffer instead
5274 (to save space, not time) */
5275 Py_INCREF(self);
5276 Py_DECREF(u);
5277 return (PyObject*) self;
5278 }
5279 return (PyObject*) u;
5280}
5281
Tim Petersced69f82003-09-16 20:30:58 +00005282static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283int fixupper(PyUnicodeObject *self)
5284{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 Py_UNICODE *s = self->str;
5287 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 while (len-- > 0) {
5290 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 ch = Py_UNICODE_TOUPPER(*s);
5293 if (ch != *s) {
5294 status = 1;
5295 *s = ch;
5296 }
5297 s++;
5298 }
5299
5300 return status;
5301}
5302
Tim Petersced69f82003-09-16 20:30:58 +00005303static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304int fixlower(PyUnicodeObject *self)
5305{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 Py_UNICODE *s = self->str;
5308 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005309
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 while (len-- > 0) {
5311 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 ch = Py_UNICODE_TOLOWER(*s);
5314 if (ch != *s) {
5315 status = 1;
5316 *s = ch;
5317 }
5318 s++;
5319 }
5320
5321 return status;
5322}
5323
Tim Petersced69f82003-09-16 20:30:58 +00005324static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325int fixswapcase(PyUnicodeObject *self)
5326{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005327 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 Py_UNICODE *s = self->str;
5329 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005330
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 while (len-- > 0) {
5332 if (Py_UNICODE_ISUPPER(*s)) {
5333 *s = Py_UNICODE_TOLOWER(*s);
5334 status = 1;
5335 } else if (Py_UNICODE_ISLOWER(*s)) {
5336 *s = Py_UNICODE_TOUPPER(*s);
5337 status = 1;
5338 }
5339 s++;
5340 }
5341
5342 return status;
5343}
5344
Tim Petersced69f82003-09-16 20:30:58 +00005345static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346int fixcapitalize(PyUnicodeObject *self)
5347{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005348 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005349 Py_UNICODE *s = self->str;
5350 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005351
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005352 if (len == 0)
5353 return 0;
5354 if (Py_UNICODE_ISLOWER(*s)) {
5355 *s = Py_UNICODE_TOUPPER(*s);
5356 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005358 s++;
5359 while (--len > 0) {
5360 if (Py_UNICODE_ISUPPER(*s)) {
5361 *s = Py_UNICODE_TOLOWER(*s);
5362 status = 1;
5363 }
5364 s++;
5365 }
5366 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367}
5368
5369static
5370int fixtitle(PyUnicodeObject *self)
5371{
5372 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5373 register Py_UNICODE *e;
5374 int previous_is_cased;
5375
5376 /* Shortcut for single character strings */
5377 if (PyUnicode_GET_SIZE(self) == 1) {
5378 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5379 if (*p != ch) {
5380 *p = ch;
5381 return 1;
5382 }
5383 else
5384 return 0;
5385 }
Tim Petersced69f82003-09-16 20:30:58 +00005386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 e = p + PyUnicode_GET_SIZE(self);
5388 previous_is_cased = 0;
5389 for (; p < e; p++) {
5390 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005391
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 if (previous_is_cased)
5393 *p = Py_UNICODE_TOLOWER(ch);
5394 else
5395 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005396
5397 if (Py_UNICODE_ISLOWER(ch) ||
5398 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 Py_UNICODE_ISTITLE(ch))
5400 previous_is_cased = 1;
5401 else
5402 previous_is_cased = 0;
5403 }
5404 return 1;
5405}
5406
Tim Peters8ce9f162004-08-27 01:49:32 +00005407PyObject *
5408PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409{
Tim Peters8ce9f162004-08-27 01:49:32 +00005410 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005411 const Py_UNICODE blank = ' ';
5412 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005413 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005414 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005415 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5416 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005417 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5418 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005419 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005420 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005421 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422
Tim Peters05eba1f2004-08-27 21:32:02 +00005423 fseq = PySequence_Fast(seq, "");
5424 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005425 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005426 }
5427
Tim Peters91879ab2004-08-27 22:35:44 +00005428 /* Grrrr. A codec may be invoked to convert str objects to
5429 * Unicode, and so it's possible to call back into Python code
5430 * during PyUnicode_FromObject(), and so it's possible for a sick
5431 * codec to change the size of fseq (if seq is a list). Therefore
5432 * we have to keep refetching the size -- can't assume seqlen
5433 * is invariant.
5434 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005435 seqlen = PySequence_Fast_GET_SIZE(fseq);
5436 /* If empty sequence, return u"". */
5437 if (seqlen == 0) {
5438 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5439 goto Done;
5440 }
5441 /* If singleton sequence with an exact Unicode, return that. */
5442 if (seqlen == 1) {
5443 item = PySequence_Fast_GET_ITEM(fseq, 0);
5444 if (PyUnicode_CheckExact(item)) {
5445 Py_INCREF(item);
5446 res = (PyUnicodeObject *)item;
5447 goto Done;
5448 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005449 }
5450
Tim Peters05eba1f2004-08-27 21:32:02 +00005451 /* At least two items to join, or one that isn't exact Unicode. */
5452 if (seqlen > 1) {
5453 /* Set up sep and seplen -- they're needed. */
5454 if (separator == NULL) {
5455 sep = &blank;
5456 seplen = 1;
5457 }
5458 else {
5459 internal_separator = PyUnicode_FromObject(separator);
5460 if (internal_separator == NULL)
5461 goto onError;
5462 sep = PyUnicode_AS_UNICODE(internal_separator);
5463 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005464 /* In case PyUnicode_FromObject() mutated seq. */
5465 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005466 }
5467 }
5468
5469 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005470 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005471 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005472 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005473 res_p = PyUnicode_AS_UNICODE(res);
5474 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005475
Tim Peters05eba1f2004-08-27 21:32:02 +00005476 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005477 Py_ssize_t itemlen;
5478 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005479
5480 item = PySequence_Fast_GET_ITEM(fseq, i);
5481 /* Convert item to Unicode. */
Guido van Rossumf1044292007-09-27 18:01:22 +00005482 if (!PyString_Check(item) && !PyUnicode_Check(item))
5483 {
5484 if (PyBytes_Check(item))
5485 {
5486 PyErr_Format(PyExc_TypeError,
5487 "sequence item %d: join() will not operate on "
5488 "bytes objects", i);
5489 goto onError;
5490 }
5491 item = PyObject_Unicode(item);
Tim Peters8ce9f162004-08-27 01:49:32 +00005492 }
Guido van Rossumf1044292007-09-27 18:01:22 +00005493 else
5494 item = PyUnicode_FromObject(item);
5495
Tim Peters05eba1f2004-08-27 21:32:02 +00005496 if (item == NULL)
5497 goto onError;
5498 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005499
Tim Peters91879ab2004-08-27 22:35:44 +00005500 /* In case PyUnicode_FromObject() mutated seq. */
5501 seqlen = PySequence_Fast_GET_SIZE(fseq);
5502
Tim Peters8ce9f162004-08-27 01:49:32 +00005503 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005506 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 if (i < seqlen - 1) {
5509 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005510 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005511 goto Overflow;
5512 }
5513 if (new_res_used > res_alloc) {
5514 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005515 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005517 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005518 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005519 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005520 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005521 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005523 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005524 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005526
5527 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005528 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005529 res_p += itemlen;
5530 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005531 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005532 res_p += seplen;
5533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005535 res_used = new_res_used;
5536 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005537
Tim Peters05eba1f2004-08-27 21:32:02 +00005538 /* Shrink res to match the used area; this probably can't fail,
5539 * but it's cheap to check.
5540 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005541 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005542 goto onError;
5543
5544 Done:
5545 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005546 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 return (PyObject *)res;
5548
Tim Peters8ce9f162004-08-27 01:49:32 +00005549 Overflow:
5550 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005551 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005552 Py_DECREF(item);
5553 /* fall through */
5554
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005556 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005558 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 return NULL;
5560}
5561
Tim Petersced69f82003-09-16 20:30:58 +00005562static
5563PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005564 Py_ssize_t left,
5565 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 Py_UNICODE fill)
5567{
5568 PyUnicodeObject *u;
5569
5570 if (left < 0)
5571 left = 0;
5572 if (right < 0)
5573 right = 0;
5574
Tim Peters7a29bd52001-09-12 03:03:31 +00005575 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 Py_INCREF(self);
5577 return self;
5578 }
5579
5580 u = _PyUnicode_New(left + self->length + right);
5581 if (u) {
5582 if (left)
5583 Py_UNICODE_FILL(u->str, fill, left);
5584 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5585 if (right)
5586 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5587 }
5588
5589 return u;
5590}
5591
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on names supplied by the expanding function: a PyObject *str
   scratch variable, the PyObject *list being built, and an onError label
   that is jumped to when either the slice creation or the append fails.
   Wrapped in do { } while (0) so that the expansion is a single statement
   and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5602
5603static
5604PyObject *split_whitespace(PyUnicodeObject *self,
5605 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005606 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005608 register Py_ssize_t i;
5609 register Py_ssize_t j;
5610 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 PyObject *str;
5612
5613 for (i = j = 0; i < len; ) {
5614 /* find a token */
5615 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5616 i++;
5617 j = i;
5618 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5619 i++;
5620 if (j < i) {
5621 if (maxcount-- <= 0)
5622 break;
5623 SPLIT_APPEND(self->str, j, i);
5624 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5625 i++;
5626 j = i;
5627 }
5628 }
5629 if (j < len) {
5630 SPLIT_APPEND(self->str, j, len);
5631 }
5632 return list;
5633
5634 onError:
5635 Py_DECREF(list);
5636 return NULL;
5637}
5638
5639PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005640 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 register Py_ssize_t i;
5643 register Py_ssize_t j;
5644 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 PyObject *list;
5646 PyObject *str;
5647 Py_UNICODE *data;
5648
5649 string = PyUnicode_FromObject(string);
5650 if (string == NULL)
5651 return NULL;
5652 data = PyUnicode_AS_UNICODE(string);
5653 len = PyUnicode_GET_SIZE(string);
5654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 list = PyList_New(0);
5656 if (!list)
5657 goto onError;
5658
5659 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005661
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005663 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665
5666 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005667 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 if (i < len) {
5669 if (data[i] == '\r' && i + 1 < len &&
5670 data[i+1] == '\n')
5671 i += 2;
5672 else
5673 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005674 if (keepends)
5675 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 }
Guido van Rossum86662912000-04-11 15:38:46 +00005677 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 j = i;
5679 }
5680 if (j < len) {
5681 SPLIT_APPEND(data, j, len);
5682 }
5683
5684 Py_DECREF(string);
5685 return list;
5686
5687 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005688 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 Py_DECREF(string);
5690 return NULL;
5691}
5692
Tim Petersced69f82003-09-16 20:30:58 +00005693static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694PyObject *split_char(PyUnicodeObject *self,
5695 PyObject *list,
5696 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005697 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005699 register Py_ssize_t i;
5700 register Py_ssize_t j;
5701 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 PyObject *str;
5703
5704 for (i = j = 0; i < len; ) {
5705 if (self->str[i] == ch) {
5706 if (maxcount-- <= 0)
5707 break;
5708 SPLIT_APPEND(self->str, j, i);
5709 i = j = i + 1;
5710 } else
5711 i++;
5712 }
5713 if (j <= len) {
5714 SPLIT_APPEND(self->str, j, len);
5715 }
5716 return list;
5717
5718 onError:
5719 Py_DECREF(list);
5720 return NULL;
5721}
5722
Tim Petersced69f82003-09-16 20:30:58 +00005723static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724PyObject *split_substring(PyUnicodeObject *self,
5725 PyObject *list,
5726 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005727 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729 register Py_ssize_t i;
5730 register Py_ssize_t j;
5731 Py_ssize_t len = self->length;
5732 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 PyObject *str;
5734
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005735 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 if (Py_UNICODE_MATCH(self, i, substring)) {
5737 if (maxcount-- <= 0)
5738 break;
5739 SPLIT_APPEND(self->str, j, i);
5740 i = j = i + sublen;
5741 } else
5742 i++;
5743 }
5744 if (j <= len) {
5745 SPLIT_APPEND(self->str, j, len);
5746 }
5747 return list;
5748
5749 onError:
5750 Py_DECREF(list);
5751 return NULL;
5752}
5753
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005754static
5755PyObject *rsplit_whitespace(PyUnicodeObject *self,
5756 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005757 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005758{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005759 register Py_ssize_t i;
5760 register Py_ssize_t j;
5761 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005762 PyObject *str;
5763
5764 for (i = j = len - 1; i >= 0; ) {
5765 /* find a token */
5766 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5767 i--;
5768 j = i;
5769 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5770 i--;
5771 if (j > i) {
5772 if (maxcount-- <= 0)
5773 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005774 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5776 i--;
5777 j = i;
5778 }
5779 }
5780 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005781 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005783 if (PyList_Reverse(list) < 0)
5784 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785 return list;
5786
5787 onError:
5788 Py_DECREF(list);
5789 return NULL;
5790}
5791
5792static
5793PyObject *rsplit_char(PyUnicodeObject *self,
5794 PyObject *list,
5795 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005796 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 register Py_ssize_t i;
5799 register Py_ssize_t j;
5800 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005801 PyObject *str;
5802
5803 for (i = j = len - 1; i >= 0; ) {
5804 if (self->str[i] == ch) {
5805 if (maxcount-- <= 0)
5806 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005807 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005808 j = i = i - 1;
5809 } else
5810 i--;
5811 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005812 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005813 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005814 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005815 if (PyList_Reverse(list) < 0)
5816 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817 return list;
5818
5819 onError:
5820 Py_DECREF(list);
5821 return NULL;
5822}
5823
5824static
5825PyObject *rsplit_substring(PyUnicodeObject *self,
5826 PyObject *list,
5827 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005828 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005829{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005830 register Py_ssize_t i;
5831 register Py_ssize_t j;
5832 Py_ssize_t len = self->length;
5833 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005834 PyObject *str;
5835
5836 for (i = len - sublen, j = len; i >= 0; ) {
5837 if (Py_UNICODE_MATCH(self, i, substring)) {
5838 if (maxcount-- <= 0)
5839 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005840 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005841 j = i;
5842 i -= sublen;
5843 } else
5844 i--;
5845 }
5846 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005847 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005848 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005849 if (PyList_Reverse(list) < 0)
5850 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005851 return list;
5852
5853 onError:
5854 Py_DECREF(list);
5855 return NULL;
5856}
5857
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858#undef SPLIT_APPEND
5859
5860static
5861PyObject *split(PyUnicodeObject *self,
5862 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864{
5865 PyObject *list;
5866
5867 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005868 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
5870 list = PyList_New(0);
5871 if (!list)
5872 return NULL;
5873
5874 if (substring == NULL)
5875 return split_whitespace(self,list,maxcount);
5876
5877 else if (substring->length == 1)
5878 return split_char(self,list,substring->str[0],maxcount);
5879
5880 else if (substring->length == 0) {
5881 Py_DECREF(list);
5882 PyErr_SetString(PyExc_ValueError, "empty separator");
5883 return NULL;
5884 }
5885 else
5886 return split_substring(self,list,substring,maxcount);
5887}
5888
Tim Petersced69f82003-09-16 20:30:58 +00005889static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005890PyObject *rsplit(PyUnicodeObject *self,
5891 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005892 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005893{
5894 PyObject *list;
5895
5896 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005897 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005898
5899 list = PyList_New(0);
5900 if (!list)
5901 return NULL;
5902
5903 if (substring == NULL)
5904 return rsplit_whitespace(self,list,maxcount);
5905
5906 else if (substring->length == 1)
5907 return rsplit_char(self,list,substring->str[0],maxcount);
5908
5909 else if (substring->length == 0) {
5910 Py_DECREF(list);
5911 PyErr_SetString(PyExc_ValueError, "empty separator");
5912 return NULL;
5913 }
5914 else
5915 return rsplit_substring(self,list,substring,maxcount);
5916}
5917
5918static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919PyObject *replace(PyUnicodeObject *self,
5920 PyUnicodeObject *str1,
5921 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005922 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923{
5924 PyUnicodeObject *u;
5925
5926 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005927 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928
Thomas Wouters477c8d52006-05-27 19:21:47 +00005929 if (str1->length == str2->length) {
5930 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005931 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005932 if (str1->length == 1) {
5933 /* replace characters */
5934 Py_UNICODE u1, u2;
5935 if (!findchar(self->str, self->length, str1->str[0]))
5936 goto nothing;
5937 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5938 if (!u)
5939 return NULL;
5940 Py_UNICODE_COPY(u->str, self->str, self->length);
5941 u1 = str1->str[0];
5942 u2 = str2->str[0];
5943 for (i = 0; i < u->length; i++)
5944 if (u->str[i] == u1) {
5945 if (--maxcount < 0)
5946 break;
5947 u->str[i] = u2;
5948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005950 i = fastsearch(
5951 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005953 if (i < 0)
5954 goto nothing;
5955 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5956 if (!u)
5957 return NULL;
5958 Py_UNICODE_COPY(u->str, self->str, self->length);
5959 while (i <= self->length - str1->length)
5960 if (Py_UNICODE_MATCH(self, i, str1)) {
5961 if (--maxcount < 0)
5962 break;
5963 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5964 i += str1->length;
5965 } else
5966 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005969
5970 Py_ssize_t n, i, j, e;
5971 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 Py_UNICODE *p;
5973
5974 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005975 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 if (n > maxcount)
5977 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005978 if (n == 0)
5979 goto nothing;
5980 /* new_size = self->length + n * (str2->length - str1->length)); */
5981 delta = (str2->length - str1->length);
5982 if (delta == 0) {
5983 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005985 product = n * (str2->length - str1->length);
5986 if ((product / (str2->length - str1->length)) != n) {
5987 PyErr_SetString(PyExc_OverflowError,
5988 "replace string is too long");
5989 return NULL;
5990 }
5991 new_size = self->length + product;
5992 if (new_size < 0) {
5993 PyErr_SetString(PyExc_OverflowError,
5994 "replace string is too long");
5995 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 }
5997 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005998 u = _PyUnicode_New(new_size);
5999 if (!u)
6000 return NULL;
6001 i = 0;
6002 p = u->str;
6003 e = self->length - str1->length;
6004 if (str1->length > 0) {
6005 while (n-- > 0) {
6006 /* look for next match */
6007 j = i;
6008 while (j <= e) {
6009 if (Py_UNICODE_MATCH(self, j, str1))
6010 break;
6011 j++;
6012 }
6013 if (j > i) {
6014 if (j > e)
6015 break;
6016 /* copy unchanged part [i:j] */
6017 Py_UNICODE_COPY(p, self->str+i, j-i);
6018 p += j - i;
6019 }
6020 /* copy substitution string */
6021 if (str2->length > 0) {
6022 Py_UNICODE_COPY(p, str2->str, str2->length);
6023 p += str2->length;
6024 }
6025 i = j + str1->length;
6026 }
6027 if (i < self->length)
6028 /* copy tail [i:] */
6029 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6030 } else {
6031 /* interleave */
6032 while (n > 0) {
6033 Py_UNICODE_COPY(p, str2->str, str2->length);
6034 p += str2->length;
6035 if (--n <= 0)
6036 break;
6037 *p++ = self->str[i++];
6038 }
6039 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006043
6044nothing:
6045 /* nothing to replace; return original string (when possible) */
6046 if (PyUnicode_CheckExact(self)) {
6047 Py_INCREF(self);
6048 return (PyObject *) self;
6049 }
6050 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051}
6052
6053/* --- Unicode Object Methods --------------------------------------------- */
6054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006055PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056"S.title() -> unicode\n\
6057\n\
6058Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006059characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
6061static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006062unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 return fixup(self, fixtitle);
6065}
6066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006067PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068"S.capitalize() -> unicode\n\
6069\n\
6070Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
6073static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006074unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 return fixup(self, fixcapitalize);
6077}
6078
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits self on
   whitespace, capitalizes every word in place in the list, then joins the
   words back together with PyUnicode_Join(NULL, ...).  Returns NULL if
   any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list slot's old reference */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                  fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
6116
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006117/* Argument converter. Coerces to a single unicode character */
6118
6119static int
6120convert_uc(PyObject *obj, void *addr)
6121{
6122 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6123 PyObject *uniobj;
6124 Py_UNICODE *unistr;
6125
6126 uniobj = PyUnicode_FromObject(obj);
6127 if (uniobj == NULL) {
6128 PyErr_SetString(PyExc_TypeError,
6129 "The fill character cannot be converted to Unicode");
6130 return 0;
6131 }
6132 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6133 PyErr_SetString(PyExc_TypeError,
6134 "The fill character must be exactly one character long");
6135 Py_DECREF(uniobj);
6136 return 0;
6137 }
6138 unistr = PyUnicode_AS_UNICODE(uniobj);
6139 *fillcharloc = unistr[0];
6140 Py_DECREF(uniobj);
6141 return 1;
6142}
6143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006144PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006145"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006147Return S centered in a Unicode string of length width. Padding is\n\
6148done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
6150static PyObject *
6151unicode_center(PyUnicodeObject *self, PyObject *args)
6152{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006153 Py_ssize_t marg, left;
6154 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006155 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
Thomas Woutersde017742006-02-16 19:34:37 +00006157 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 return NULL;
6159
Tim Peters7a29bd52001-09-12 03:03:31 +00006160 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 Py_INCREF(self);
6162 return (PyObject*) self;
6163 }
6164
6165 marg = width - self->length;
6166 left = marg / 2 + (marg & width & 1);
6167
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006168 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169}
6170
Marc-André Lemburge5034372000-08-08 08:04:29 +00006171#if 0
6172
6173/* This code should go into some future Unicode collation support
6174 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006175 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006176
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006177/* speedy UTF-16 code point order comparison */
6178/* gleaned from: */
6179/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6180
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006181static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006182{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006183 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006184 0, 0, 0, 0, 0, 0, 0, 0,
6185 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006186 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006187};
6188
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189static int
6190unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6191{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006192 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006193
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 Py_UNICODE *s1 = str1->str;
6195 Py_UNICODE *s2 = str2->str;
6196
6197 len1 = str1->length;
6198 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006199
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006201 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006202
6203 c1 = *s1++;
6204 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006205
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006206 if (c1 > (1<<11) * 26)
6207 c1 += utf16Fixup[c1>>11];
6208 if (c2 > (1<<11) * 26)
6209 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006210 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006211
6212 if (c1 != c2)
6213 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006214
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006215 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 }
6217
6218 return (len1 < len2) ? -1 : (len1 != len2);
6219}
6220
Marc-André Lemburge5034372000-08-08 08:04:29 +00006221#else
6222
6223static int
6224unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6225{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006226 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006227
6228 Py_UNICODE *s1 = str1->str;
6229 Py_UNICODE *s2 = str2->str;
6230
6231 len1 = str1->length;
6232 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006233
Marc-André Lemburge5034372000-08-08 08:04:29 +00006234 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006235 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006236
Fredrik Lundh45714e92001-06-26 16:39:36 +00006237 c1 = *s1++;
6238 c2 = *s2++;
6239
6240 if (c1 != c2)
6241 return (c1 < c2) ? -1 : 1;
6242
Marc-André Lemburge5034372000-08-08 08:04:29 +00006243 len1--; len2--;
6244 }
6245
6246 return (len1 < len2) ? -1 : (len1 != len2);
6247}
6248
6249#endif
6250
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251int PyUnicode_Compare(PyObject *left,
6252 PyObject *right)
6253{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006254 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6255 return unicode_compare((PyUnicodeObject *)left,
6256 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006257 PyErr_Format(PyExc_TypeError,
6258 "Can't compare %.100s and %.100s",
6259 left->ob_type->tp_name,
6260 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 return -1;
6262}
6263
Martin v. Löwis5b222132007-06-10 09:51:05 +00006264int
6265PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6266{
6267 int i;
6268 Py_UNICODE *id;
6269 assert(PyUnicode_Check(uni));
6270 id = PyUnicode_AS_UNICODE(uni);
6271 /* Compare Unicode string and source character set string */
6272 for (i = 0; id[i] && str[i]; i++)
6273 if (id[i] != str[i])
6274 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6275 if (id[i])
6276 return 1; /* uni is longer */
6277 if (str[i])
6278 return -1; /* str is longer */
6279 return 0;
6280}
6281
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006282PyObject *PyUnicode_RichCompare(PyObject *left,
6283 PyObject *right,
6284 int op)
6285{
6286 int result;
6287
6288 result = PyUnicode_Compare(left, right);
6289 if (result == -1 && PyErr_Occurred())
6290 goto onError;
6291
6292 /* Convert the return value to a Boolean */
6293 switch (op) {
6294 case Py_EQ:
6295 result = (result == 0);
6296 break;
6297 case Py_NE:
6298 result = (result != 0);
6299 break;
6300 case Py_LE:
6301 result = (result <= 0);
6302 break;
6303 case Py_GE:
6304 result = (result >= 0);
6305 break;
6306 case Py_LT:
6307 result = (result == -1);
6308 break;
6309 case Py_GT:
6310 result = (result == 1);
6311 break;
6312 }
6313 return PyBool_FromLong(result);
6314
6315 onError:
6316
6317 /* Standard case
6318
6319 Type errors mean that PyUnicode_FromObject() could not convert
6320 one of the arguments (usually the right hand side) to Unicode,
6321 ie. we can't handle the comparison request. However, it is
6322 possible that the other object knows a comparison method, which
6323 is why we return Py_NotImplemented to give the other object a
6324 chance.
6325
6326 */
6327 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6328 PyErr_Clear();
6329 Py_INCREF(Py_NotImplemented);
6330 return Py_NotImplemented;
6331 }
6332 if (op != Py_EQ && op != Py_NE)
6333 return NULL;
6334
6335 /* Equality comparison.
6336
6337 This is a special case: we silence any PyExc_UnicodeDecodeError
6338 and instead turn it into a PyErr_UnicodeWarning.
6339
6340 */
6341 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6342 return NULL;
6343 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006344 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6345 (op == Py_EQ) ?
6346 "Unicode equal comparison "
6347 "failed to convert both arguments to Unicode - "
6348 "interpreting them as being unequal"
6349 :
6350 "Unicode unequal comparison "
6351 "failed to convert both arguments to Unicode - "
6352 "interpreting them as being unequal",
6353 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006354 return NULL;
6355 result = (op == Py_NE);
6356 return PyBool_FromLong(result);
6357}
6358
Guido van Rossum403d68b2000-03-13 15:55:09 +00006359int PyUnicode_Contains(PyObject *container,
6360 PyObject *element)
6361{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006362 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006363 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006364
6365 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006366 sub = PyUnicode_FromObject(element);
6367 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006368 PyErr_Format(PyExc_TypeError,
6369 "'in <string>' requires string as left operand, not %s",
6370 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006371 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006372 }
6373
Thomas Wouters477c8d52006-05-27 19:21:47 +00006374 str = PyUnicode_FromObject(container);
6375 if (!str) {
6376 Py_DECREF(sub);
6377 return -1;
6378 }
6379
6380 result = stringlib_contains_obj(str, sub);
6381
6382 Py_DECREF(str);
6383 Py_DECREF(sub);
6384
Guido van Rossum403d68b2000-03-13 15:55:09 +00006385 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006386}
6387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388/* Concat to string or Unicode object giving a new Unicode object. */
6389
6390PyObject *PyUnicode_Concat(PyObject *left,
6391 PyObject *right)
6392{
6393 PyUnicodeObject *u = NULL, *v = NULL, *w;
6394
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006395 if (PyBytes_Check(left) || PyBytes_Check(right))
6396 return PyBytes_Concat(left, right);
6397
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 /* Coerce the two arguments */
6399 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6400 if (u == NULL)
6401 goto onError;
6402 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6403 if (v == NULL)
6404 goto onError;
6405
6406 /* Shortcuts */
6407 if (v == unicode_empty) {
6408 Py_DECREF(v);
6409 return (PyObject *)u;
6410 }
6411 if (u == unicode_empty) {
6412 Py_DECREF(u);
6413 return (PyObject *)v;
6414 }
6415
6416 /* Concat the two Unicode strings */
6417 w = _PyUnicode_New(u->length + v->length);
6418 if (w == NULL)
6419 goto onError;
6420 Py_UNICODE_COPY(w->str, u->str, u->length);
6421 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6422
6423 Py_DECREF(u);
6424 Py_DECREF(v);
6425 return (PyObject *)w;
6426
6427onError:
6428 Py_XDECREF(u);
6429 Py_XDECREF(v);
6430 return NULL;
6431}
6432
Walter Dörwald1ab83302007-05-18 17:15:44 +00006433void
6434PyUnicode_Append(PyObject **pleft, PyObject *right)
6435{
6436 PyObject *new;
6437 if (*pleft == NULL)
6438 return;
6439 if (right == NULL || !PyUnicode_Check(*pleft)) {
6440 Py_DECREF(*pleft);
6441 *pleft = NULL;
6442 return;
6443 }
6444 new = PyUnicode_Concat(*pleft, right);
6445 Py_DECREF(*pleft);
6446 *pleft = new;
6447}
6448
6449void
6450PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6451{
6452 PyUnicode_Append(pleft, right);
6453 Py_XDECREF(right);
6454}
6455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006456PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457"S.count(sub[, start[, end]]) -> int\n\
6458\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006459Return the number of non-overlapping occurrences of substring sub in\n\
6460Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006461interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462
6463static PyObject *
6464unicode_count(PyUnicodeObject *self, PyObject *args)
6465{
6466 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006467 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006468 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 PyObject *result;
6470
Guido van Rossumb8872e62000-05-09 14:14:27 +00006471 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6472 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 return NULL;
6474
6475 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006476 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 if (substring == NULL)
6478 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006479
Thomas Wouters477c8d52006-05-27 19:21:47 +00006480 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482 result = PyInt_FromSsize_t(
6483 stringlib_count(self->str + start, end - start,
6484 substring->str, substring->length)
6485 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
6487 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006488
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 return result;
6490}
6491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006492PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006493"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006495Encodes S using the codec registered for encoding. encoding defaults\n\
6496to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006497handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6499'xmlcharrefreplace' as well as any other name registered with\n\
6500codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501
6502static PyObject *
6503unicode_encode(PyUnicodeObject *self, PyObject *args)
6504{
6505 char *encoding = NULL;
6506 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006507 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006508
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6510 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006511 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006512 if (v == NULL)
6513 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006514 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006515 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006516 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006517 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006518 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006519 Py_DECREF(v);
6520 return NULL;
6521 }
6522 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006523
6524 onError:
6525 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006526}
6527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006528PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529"S.expandtabs([tabsize]) -> unicode\n\
6530\n\
6531Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006532If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
6534static PyObject*
6535unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6536{
6537 Py_UNICODE *e;
6538 Py_UNICODE *p;
6539 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006540 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 PyUnicodeObject *u;
6542 int tabsize = 8;
6543
6544 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6545 return NULL;
6546
Thomas Wouters7e474022000-07-16 12:04:32 +00006547 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006548 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 e = self->str + self->length;
6550 for (p = self->str; p < e; p++)
6551 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006552 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006554 if (old_j > j) {
6555 PyErr_SetString(PyExc_OverflowError,
6556 "new string is too long");
6557 return NULL;
6558 }
6559 old_j = j;
6560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 }
6562 else {
6563 j++;
6564 if (*p == '\n' || *p == '\r') {
6565 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006566 old_j = j = 0;
6567 if (i < 0) {
6568 PyErr_SetString(PyExc_OverflowError,
6569 "new string is too long");
6570 return NULL;
6571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 }
6573 }
6574
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006575 if ((i + j) < 0) {
6576 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6577 return NULL;
6578 }
6579
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 /* Second pass: create output string and fill it */
6581 u = _PyUnicode_New(i + j);
6582 if (!u)
6583 return NULL;
6584
6585 j = 0;
6586 q = u->str;
6587
6588 for (p = self->str; p < e; p++)
6589 if (*p == '\t') {
6590 if (tabsize > 0) {
6591 i = tabsize - (j % tabsize);
6592 j += i;
6593 while (i--)
6594 *q++ = ' ';
6595 }
6596 }
6597 else {
6598 j++;
6599 *q++ = *p;
6600 if (*p == '\n' || *p == '\r')
6601 j = 0;
6602 }
6603
6604 return (PyObject*) u;
6605}
6606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006607PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608"S.find(sub [,start [,end]]) -> int\n\
6609\n\
6610Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006611such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612arguments start and end are interpreted as in slice notation.\n\
6613\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006614Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615
6616static PyObject *
6617unicode_find(PyUnicodeObject *self, PyObject *args)
6618{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006619 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006620 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006621 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006622 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
Guido van Rossumb8872e62000-05-09 14:14:27 +00006624 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6625 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006627 substring = PyUnicode_FromObject(substring);
6628 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 return NULL;
6630
Thomas Wouters477c8d52006-05-27 19:21:47 +00006631 result = stringlib_find_slice(
6632 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6633 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6634 start, end
6635 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
6637 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006638
6639 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640}
6641
6642static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006643unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
6645 if (index < 0 || index >= self->length) {
6646 PyErr_SetString(PyExc_IndexError, "string index out of range");
6647 return NULL;
6648 }
6649
6650 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6651}
6652
Guido van Rossumc2504932007-09-18 19:42:40 +00006653/* Believe it or not, this produces the same value for ASCII strings
6654 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006656unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657{
Guido van Rossumc2504932007-09-18 19:42:40 +00006658 Py_ssize_t len;
6659 Py_UNICODE *p;
6660 long x;
6661
6662 if (self->hash != -1)
6663 return self->hash;
6664 len = Py_Size(self);
6665 p = self->str;
6666 x = *p << 7;
6667 while (--len >= 0)
6668 x = (1000003*x) ^ *p++;
6669 x ^= Py_Size(self);
6670 if (x == -1)
6671 x = -2;
6672 self->hash = x;
6673 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006676PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677"S.index(sub [,start [,end]]) -> int\n\
6678\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006679Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
6681static PyObject *
6682unicode_index(PyUnicodeObject *self, PyObject *args)
6683{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006684 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006685 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006686 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006687 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
Guido van Rossumb8872e62000-05-09 14:14:27 +00006689 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6690 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006692 substring = PyUnicode_FromObject(substring);
6693 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 return NULL;
6695
Thomas Wouters477c8d52006-05-27 19:21:47 +00006696 result = stringlib_find_slice(
6697 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6698 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6699 start, end
6700 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701
6702 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 if (result < 0) {
6705 PyErr_SetString(PyExc_ValueError, "substring not found");
6706 return NULL;
6707 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006708
Martin v. Löwis18e16552006-02-15 17:27:45 +00006709 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710}
6711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006713"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006715Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
6718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006719unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
6721 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6722 register const Py_UNICODE *e;
6723 int cased;
6724
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 /* Shortcut for single character strings */
6726 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006727 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006729 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006730 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006731 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006732
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 e = p + PyUnicode_GET_SIZE(self);
6734 cased = 0;
6735 for (; p < e; p++) {
6736 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006737
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006739 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 else if (!cased && Py_UNICODE_ISLOWER(ch))
6741 cased = 1;
6742 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744}
6745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006746PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006747"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006749Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751
6752static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006753unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754{
6755 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6756 register const Py_UNICODE *e;
6757 int cased;
6758
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 /* Shortcut for single character strings */
6760 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006761 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006763 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006764 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006765 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006766
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 e = p + PyUnicode_GET_SIZE(self);
6768 cased = 0;
6769 for (; p < e; p++) {
6770 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006771
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006773 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 else if (!cased && Py_UNICODE_ISUPPER(ch))
6775 cased = 1;
6776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778}
6779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006780PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006781"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006783Return True if S is a titlecased string and there is at least one\n\
6784character in S, i.e. upper- and titlecase characters may only\n\
6785follow uncased characters and lowercase characters only cased ones.\n\
6786Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
6788static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006789unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790{
6791 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6792 register const Py_UNICODE *e;
6793 int cased, previous_is_cased;
6794
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 /* Shortcut for single character strings */
6796 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006797 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6798 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006800 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006801 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006802 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006803
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 e = p + PyUnicode_GET_SIZE(self);
6805 cased = 0;
6806 previous_is_cased = 0;
6807 for (; p < e; p++) {
6808 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006809
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6811 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 previous_is_cased = 1;
6814 cased = 1;
6815 }
6816 else if (Py_UNICODE_ISLOWER(ch)) {
6817 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006818 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 previous_is_cased = 1;
6820 cased = 1;
6821 }
6822 else
6823 previous_is_cased = 0;
6824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006831Return True if all characters in S are whitespace\n\
6832and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833
6834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006835unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
6837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6838 register const Py_UNICODE *e;
6839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 /* Shortcut for single character strings */
6841 if (PyUnicode_GET_SIZE(self) == 1 &&
6842 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006846 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 e = p + PyUnicode_GET_SIZE(self);
6850 for (; p < e; p++) {
6851 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006852 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006860Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006865{
6866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6867 register const Py_UNICODE *e;
6868
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006869 /* Shortcut for single character strings */
6870 if (PyUnicode_GET_SIZE(self) == 1 &&
6871 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006872 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873
6874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006875 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006877
6878 e = p + PyUnicode_GET_SIZE(self);
6879 for (; p < e; p++) {
6880 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006881 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006884}
6885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006886PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006889Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891
6892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006893unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006894{
6895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6896 register const Py_UNICODE *e;
6897
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898 /* Shortcut for single character strings */
6899 if (PyUnicode_GET_SIZE(self) == 1 &&
6900 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006901 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006902
6903 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006904 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006906
6907 e = p + PyUnicode_GET_SIZE(self);
6908 for (; p < e; p++) {
6909 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006912 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006913}
6914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006915PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006916"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006918Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006919False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
6921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006922unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923{
6924 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6925 register const Py_UNICODE *e;
6926
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 /* Shortcut for single character strings */
6928 if (PyUnicode_GET_SIZE(self) == 1 &&
6929 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006930 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006932 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006933 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006935
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 e = p + PyUnicode_GET_SIZE(self);
6937 for (; p < e; p++) {
6938 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942}
6943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006944PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006945"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006947Return True if all characters in S are digits\n\
6948and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006951unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952{
6953 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6954 register const Py_UNICODE *e;
6955
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 /* Shortcut for single character strings */
6957 if (PyUnicode_GET_SIZE(self) == 1 &&
6958 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006959 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006961 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006962 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006964
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 e = p + PyUnicode_GET_SIZE(self);
6966 for (; p < e; p++) {
6967 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006968 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971}
6972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006973PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006974"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006976Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006977False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978
6979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006980unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981{
6982 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6983 register const Py_UNICODE *e;
6984
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 /* Shortcut for single character strings */
6986 if (PyUnicode_GET_SIZE(self) == 1 &&
6987 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006988 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006990 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006991 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006992 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006993
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 e = p + PyUnicode_GET_SIZE(self);
6995 for (; p < e; p++) {
6996 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006997 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006999 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000}
7001
Martin v. Löwis47383402007-08-15 07:32:56 +00007002int
7003PyUnicode_IsIdentifier(PyObject *self)
7004{
7005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7006 register const Py_UNICODE *e;
7007
7008 /* Special case for empty strings */
7009 if (PyUnicode_GET_SIZE(self) == 0)
7010 return 0;
7011
7012 /* PEP 3131 says that the first character must be in
7013 XID_Start and subsequent characters in XID_Continue,
7014 and for the ASCII range, the 2.x rules apply (i.e
7015 start with letters and underscore, continue with
7016 letters, digits, underscore). However, given the current
7017 definition of XID_Start and XID_Continue, it is sufficient
7018 to check just for these, except that _ must be allowed
7019 as starting an identifier. */
7020 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7021 return 0;
7022
7023 e = p + PyUnicode_GET_SIZE(self);
7024 for (p++; p < e; p++) {
7025 if (!_PyUnicode_IsXidContinue(*p))
7026 return 0;
7027 }
7028 return 1;
7029}
7030
7031PyDoc_STRVAR(isidentifier__doc__,
7032"S.isidentifier() -> bool\n\
7033\n\
7034Return True if S is a valid identifier according\n\
7035to the language definition.");
7036
7037static PyObject*
7038unicode_isidentifier(PyObject *self)
7039{
7040 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7041}
7042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007043PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044"S.join(sequence) -> unicode\n\
7045\n\
7046Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007047sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048
7049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007050unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007052 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053}
7054
Martin v. Löwis18e16552006-02-15 17:27:45 +00007055static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056unicode_length(PyUnicodeObject *self)
7057{
7058 return self->length;
7059}
7060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007061PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007062"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063\n\
7064Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007065done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066
7067static PyObject *
7068unicode_ljust(PyUnicodeObject *self, PyObject *args)
7069{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007070 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007071 Py_UNICODE fillchar = ' ';
7072
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007073 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 return NULL;
7075
Tim Peters7a29bd52001-09-12 03:03:31 +00007076 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 Py_INCREF(self);
7078 return (PyObject*) self;
7079 }
7080
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007081 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082}
7083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007084PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085"S.lower() -> unicode\n\
7086\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088
7089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007090unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 return fixup(self, fixlower);
7093}
7094
/* Strip modes shared by the strip helpers below; the values double as
   indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7103
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007104/* externally visible for str.strip(unicode) */
7105PyObject *
7106_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7107{
7108 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007109 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007111 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7112 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007113
Thomas Wouters477c8d52006-05-27 19:21:47 +00007114 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7115
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116 i = 0;
7117 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007118 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7119 i++;
7120 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007121 }
7122
7123 j = len;
7124 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007125 do {
7126 j--;
7127 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7128 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007129 }
7130
7131 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007132 Py_INCREF(self);
7133 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007134 }
7135 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007136 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007137}
7138
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139
7140static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007141do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007144 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007145
7146 i = 0;
7147 if (striptype != RIGHTSTRIP) {
7148 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7149 i++;
7150 }
7151 }
7152
7153 j = len;
7154 if (striptype != LEFTSTRIP) {
7155 do {
7156 j--;
7157 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7158 j++;
7159 }
7160
7161 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7162 Py_INCREF(self);
7163 return (PyObject*)self;
7164 }
7165 else
7166 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167}
7168
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007169
7170static PyObject *
7171do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7172{
7173 PyObject *sep = NULL;
7174
7175 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7176 return NULL;
7177
7178 if (sep != NULL && sep != Py_None) {
7179 if (PyUnicode_Check(sep))
7180 return _PyUnicode_XStrip(self, striptype, sep);
7181 else if (PyString_Check(sep)) {
7182 PyObject *res;
7183 sep = PyUnicode_FromObject(sep);
7184 if (sep==NULL)
7185 return NULL;
7186 res = _PyUnicode_XStrip(self, striptype, sep);
7187 Py_DECREF(sep);
7188 return res;
7189 }
7190 else {
7191 PyErr_Format(PyExc_TypeError,
7192 "%s arg must be None, unicode or str",
7193 STRIPNAME(striptype));
7194 return NULL;
7195 }
7196 }
7197
7198 return do_strip(self, striptype);
7199}
7200
7201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007202PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007203"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007204\n\
7205Return a copy of the string S with leading and trailing\n\
7206whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007207If chars is given and not None, remove characters in chars instead.\n\
7208If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007209
7210static PyObject *
7211unicode_strip(PyUnicodeObject *self, PyObject *args)
7212{
7213 if (PyTuple_GET_SIZE(args) == 0)
7214 return do_strip(self, BOTHSTRIP); /* Common case */
7215 else
7216 return do_argstrip(self, BOTHSTRIP, args);
7217}
7218
7219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007220PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007221"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007222\n\
7223Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007224If chars is given and not None, remove characters in chars instead.\n\
7225If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007226
7227static PyObject *
7228unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7229{
7230 if (PyTuple_GET_SIZE(args) == 0)
7231 return do_strip(self, LEFTSTRIP); /* Common case */
7232 else
7233 return do_argstrip(self, LEFTSTRIP, args);
7234}
7235
7236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007237PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007238"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007239\n\
7240Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007241If chars is given and not None, remove characters in chars instead.\n\
7242If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007243
7244static PyObject *
7245unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7246{
7247 if (PyTuple_GET_SIZE(args) == 0)
7248 return do_strip(self, RIGHTSTRIP); /* Common case */
7249 else
7250 return do_argstrip(self, RIGHTSTRIP, args);
7251}
7252
7253
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256{
7257 PyUnicodeObject *u;
7258 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007259 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007260 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261
7262 if (len < 0)
7263 len = 0;
7264
Tim Peters7a29bd52001-09-12 03:03:31 +00007265 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 /* no repeat, return original string */
7267 Py_INCREF(str);
7268 return (PyObject*) str;
7269 }
Tim Peters8f422462000-09-09 06:13:41 +00007270
7271 /* ensure # of chars needed doesn't overflow int and # of bytes
7272 * needed doesn't overflow size_t
7273 */
7274 nchars = len * str->length;
7275 if (len && nchars / len != str->length) {
7276 PyErr_SetString(PyExc_OverflowError,
7277 "repeated string is too long");
7278 return NULL;
7279 }
7280 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7281 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7282 PyErr_SetString(PyExc_OverflowError,
7283 "repeated string is too long");
7284 return NULL;
7285 }
7286 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287 if (!u)
7288 return NULL;
7289
7290 p = u->str;
7291
Thomas Wouters477c8d52006-05-27 19:21:47 +00007292 if (str->length == 1 && len > 0) {
7293 Py_UNICODE_FILL(p, str->str[0], len);
7294 } else {
7295 Py_ssize_t done = 0; /* number of characters copied this far */
7296 if (done < nchars) {
7297 Py_UNICODE_COPY(p, str->str, str->length);
7298 done = str->length;
7299 }
7300 while (done < nchars) {
7301 int n = (done <= nchars-done) ? done : nchars-done;
7302 Py_UNICODE_COPY(p+done, p, n);
7303 done += n;
7304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 }
7306
7307 return (PyObject*) u;
7308}
7309
7310PyObject *PyUnicode_Replace(PyObject *obj,
7311 PyObject *subobj,
7312 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007313 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314{
7315 PyObject *self;
7316 PyObject *str1;
7317 PyObject *str2;
7318 PyObject *result;
7319
7320 self = PyUnicode_FromObject(obj);
7321 if (self == NULL)
7322 return NULL;
7323 str1 = PyUnicode_FromObject(subobj);
7324 if (str1 == NULL) {
7325 Py_DECREF(self);
7326 return NULL;
7327 }
7328 str2 = PyUnicode_FromObject(replobj);
7329 if (str2 == NULL) {
7330 Py_DECREF(self);
7331 Py_DECREF(str1);
7332 return NULL;
7333 }
Tim Petersced69f82003-09-16 20:30:58 +00007334 result = replace((PyUnicodeObject *)self,
7335 (PyUnicodeObject *)str1,
7336 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 maxcount);
7338 Py_DECREF(self);
7339 Py_DECREF(str1);
7340 Py_DECREF(str2);
7341 return result;
7342}
7343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007344PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345"S.replace (old, new[, maxsplit]) -> unicode\n\
7346\n\
7347Return a copy of S with all occurrences of substring\n\
7348old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007349given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350
7351static PyObject*
7352unicode_replace(PyUnicodeObject *self, PyObject *args)
7353{
7354 PyUnicodeObject *str1;
7355 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007356 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 PyObject *result;
7358
Martin v. Löwis18e16552006-02-15 17:27:45 +00007359 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 return NULL;
7361 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7362 if (str1 == NULL)
7363 return NULL;
7364 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007365 if (str2 == NULL) {
7366 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369
7370 result = replace(self, str1, str2, maxcount);
7371
7372 Py_DECREF(str1);
7373 Py_DECREF(str2);
7374 return result;
7375}
7376
7377static
7378PyObject *unicode_repr(PyObject *unicode)
7379{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007380 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007381 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007382 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7383 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7384
7385 /* XXX(nnorwitz): rather than over-allocating, it would be
7386 better to choose a different scheme. Perhaps scan the
7387 first N-chars of the string and allocate based on that size.
7388 */
7389 /* Initial allocation is based on the longest-possible unichr
7390 escape.
7391
7392 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7393 unichr, so in this case it's the longest unichr escape. In
7394 narrow (UTF-16) builds this is five chars per source unichr
7395 since there are two unichrs in the surrogate pair, so in narrow
7396 (UTF-16) builds it's not the longest unichr escape.
7397
7398 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7399 so in the narrow (UTF-16) build case it's the longest unichr
7400 escape.
7401 */
7402
Walter Dörwald1ab83302007-05-18 17:15:44 +00007403 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007404 2 /* quotes */
7405#ifdef Py_UNICODE_WIDE
7406 + 10*size
7407#else
7408 + 6*size
7409#endif
7410 + 1);
7411 if (repr == NULL)
7412 return NULL;
7413
Walter Dörwald1ab83302007-05-18 17:15:44 +00007414 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007415
7416 /* Add quote */
7417 *p++ = (findchar(s, size, '\'') &&
7418 !findchar(s, size, '"')) ? '"' : '\'';
7419 while (size-- > 0) {
7420 Py_UNICODE ch = *s++;
7421
7422 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007423 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007424 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007425 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007426 continue;
7427 }
7428
7429#ifdef Py_UNICODE_WIDE
7430 /* Map 21-bit characters to '\U00xxxxxx' */
7431 else if (ch >= 0x10000) {
7432 *p++ = '\\';
7433 *p++ = 'U';
7434 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7435 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7436 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7437 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7438 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7439 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7440 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7441 *p++ = hexdigits[ch & 0x0000000F];
7442 continue;
7443 }
7444#else
7445 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7446 else if (ch >= 0xD800 && ch < 0xDC00) {
7447 Py_UNICODE ch2;
7448 Py_UCS4 ucs;
7449
7450 ch2 = *s++;
7451 size--;
7452 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7453 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7454 *p++ = '\\';
7455 *p++ = 'U';
7456 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7457 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7458 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7459 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7460 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7461 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7462 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7463 *p++ = hexdigits[ucs & 0x0000000F];
7464 continue;
7465 }
7466 /* Fall through: isolated surrogates are copied as-is */
7467 s--;
7468 size++;
7469 }
7470#endif
7471
7472 /* Map 16-bit characters to '\uxxxx' */
7473 if (ch >= 256) {
7474 *p++ = '\\';
7475 *p++ = 'u';
7476 *p++ = hexdigits[(ch >> 12) & 0x000F];
7477 *p++ = hexdigits[(ch >> 8) & 0x000F];
7478 *p++ = hexdigits[(ch >> 4) & 0x000F];
7479 *p++ = hexdigits[ch & 0x000F];
7480 }
7481
7482 /* Map special whitespace to '\t', \n', '\r' */
7483 else if (ch == '\t') {
7484 *p++ = '\\';
7485 *p++ = 't';
7486 }
7487 else if (ch == '\n') {
7488 *p++ = '\\';
7489 *p++ = 'n';
7490 }
7491 else if (ch == '\r') {
7492 *p++ = '\\';
7493 *p++ = 'r';
7494 }
7495
7496 /* Map non-printable US ASCII to '\xhh' */
7497 else if (ch < ' ' || ch >= 0x7F) {
7498 *p++ = '\\';
7499 *p++ = 'x';
7500 *p++ = hexdigits[(ch >> 4) & 0x000F];
7501 *p++ = hexdigits[ch & 0x000F];
7502 }
7503
7504 /* Copy everything else as-is */
7505 else
7506 *p++ = (char) ch;
7507 }
7508 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007509 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007510
7511 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007512 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007513 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514}
7515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007516PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517"S.rfind(sub [,start [,end]]) -> int\n\
7518\n\
7519Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007520such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521arguments start and end are interpreted as in slice notation.\n\
7522\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007523Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
7525static PyObject *
7526unicode_rfind(PyUnicodeObject *self, PyObject *args)
7527{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007528 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007529 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007530 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007531 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532
Guido van Rossumb8872e62000-05-09 14:14:27 +00007533 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7534 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007536 substring = PyUnicode_FromObject(substring);
7537 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 return NULL;
7539
Thomas Wouters477c8d52006-05-27 19:21:47 +00007540 result = stringlib_rfind_slice(
7541 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7542 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7543 start, end
7544 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545
7546 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007547
7548 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549}
7550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007551PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552"S.rindex(sub [,start [,end]]) -> int\n\
7553\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555
7556static PyObject *
7557unicode_rindex(PyUnicodeObject *self, PyObject *args)
7558{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007559 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007560 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007561 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007562 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
Guido van Rossumb8872e62000-05-09 14:14:27 +00007564 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7565 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007567 substring = PyUnicode_FromObject(substring);
7568 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 return NULL;
7570
Thomas Wouters477c8d52006-05-27 19:21:47 +00007571 result = stringlib_rfind_slice(
7572 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7573 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7574 start, end
7575 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
7577 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007578
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579 if (result < 0) {
7580 PyErr_SetString(PyExc_ValueError, "substring not found");
7581 return NULL;
7582 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007583 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584}
7585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007586PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007587"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588\n\
7589Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007590done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
7592static PyObject *
7593unicode_rjust(PyUnicodeObject *self, PyObject *args)
7594{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007595 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007596 Py_UNICODE fillchar = ' ';
7597
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007598 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 return NULL;
7600
Tim Peters7a29bd52001-09-12 03:03:31 +00007601 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 Py_INCREF(self);
7603 return (PyObject*) self;
7604 }
7605
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007606 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607}
7608
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609PyObject *PyUnicode_Split(PyObject *s,
7610 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007611 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612{
7613 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007614
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 s = PyUnicode_FromObject(s);
7616 if (s == NULL)
7617 return NULL;
7618 if (sep != NULL) {
7619 sep = PyUnicode_FromObject(sep);
7620 if (sep == NULL) {
7621 Py_DECREF(s);
7622 return NULL;
7623 }
7624 }
7625
7626 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7627
7628 Py_DECREF(s);
7629 Py_XDECREF(sep);
7630 return result;
7631}
7632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007633PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634"S.split([sep [,maxsplit]]) -> list of strings\n\
7635\n\
7636Return a list of the words in S, using sep as the\n\
7637delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007638splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007639any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
7641static PyObject*
7642unicode_split(PyUnicodeObject *self, PyObject *args)
7643{
7644 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007645 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646
Martin v. Löwis18e16552006-02-15 17:27:45 +00007647 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 return NULL;
7649
7650 if (substring == Py_None)
7651 return split(self, NULL, maxcount);
7652 else if (PyUnicode_Check(substring))
7653 return split(self, (PyUnicodeObject *)substring, maxcount);
7654 else
7655 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7656}
7657
Thomas Wouters477c8d52006-05-27 19:21:47 +00007658PyObject *
7659PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7660{
7661 PyObject* str_obj;
7662 PyObject* sep_obj;
7663 PyObject* out;
7664
7665 str_obj = PyUnicode_FromObject(str_in);
7666 if (!str_obj)
7667 return NULL;
7668 sep_obj = PyUnicode_FromObject(sep_in);
7669 if (!sep_obj) {
7670 Py_DECREF(str_obj);
7671 return NULL;
7672 }
7673
7674 out = stringlib_partition(
7675 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7676 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7677 );
7678
7679 Py_DECREF(sep_obj);
7680 Py_DECREF(str_obj);
7681
7682 return out;
7683}
7684
7685
7686PyObject *
7687PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7688{
7689 PyObject* str_obj;
7690 PyObject* sep_obj;
7691 PyObject* out;
7692
7693 str_obj = PyUnicode_FromObject(str_in);
7694 if (!str_obj)
7695 return NULL;
7696 sep_obj = PyUnicode_FromObject(sep_in);
7697 if (!sep_obj) {
7698 Py_DECREF(str_obj);
7699 return NULL;
7700 }
7701
7702 out = stringlib_rpartition(
7703 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7704 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7705 );
7706
7707 Py_DECREF(sep_obj);
7708 Py_DECREF(str_obj);
7709
7710 return out;
7711}
7712
7713PyDoc_STRVAR(partition__doc__,
7714"S.partition(sep) -> (head, sep, tail)\n\
7715\n\
7716Searches for the separator sep in S, and returns the part before it,\n\
7717the separator itself, and the part after it. If the separator is not\n\
7718found, returns S and two empty strings.");
7719
7720static PyObject*
7721unicode_partition(PyUnicodeObject *self, PyObject *separator)
7722{
7723 return PyUnicode_Partition((PyObject *)self, separator);
7724}
7725
7726PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007727"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007728\n\
7729Searches for the separator sep in S, starting at the end of S, and returns\n\
7730the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007731separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007732
7733static PyObject*
7734unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7735{
7736 return PyUnicode_RPartition((PyObject *)self, separator);
7737}
7738
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007739PyObject *PyUnicode_RSplit(PyObject *s,
7740 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007741 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007742{
7743 PyObject *result;
7744
7745 s = PyUnicode_FromObject(s);
7746 if (s == NULL)
7747 return NULL;
7748 if (sep != NULL) {
7749 sep = PyUnicode_FromObject(sep);
7750 if (sep == NULL) {
7751 Py_DECREF(s);
7752 return NULL;
7753 }
7754 }
7755
7756 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7757
7758 Py_DECREF(s);
7759 Py_XDECREF(sep);
7760 return result;
7761}
7762
7763PyDoc_STRVAR(rsplit__doc__,
7764"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7765\n\
7766Return a list of the words in S, using sep as the\n\
7767delimiter string, starting at the end of the string and\n\
7768working to the front. If maxsplit is given, at most maxsplit\n\
7769splits are done. If sep is not specified, any whitespace string\n\
7770is a separator.");
7771
7772static PyObject*
7773unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7774{
7775 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007776 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007777
Martin v. Löwis18e16552006-02-15 17:27:45 +00007778 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007779 return NULL;
7780
7781 if (substring == Py_None)
7782 return rsplit(self, NULL, maxcount);
7783 else if (PyUnicode_Check(substring))
7784 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7785 else
7786 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7787}
7788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007789PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007790"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791\n\
7792Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007793Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007794is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
7796static PyObject*
7797unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7798{
Guido van Rossum86662912000-04-11 15:38:46 +00007799 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800
Guido van Rossum86662912000-04-11 15:38:46 +00007801 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 return NULL;
7803
Guido van Rossum86662912000-04-11 15:38:46 +00007804 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805}
7806
7807static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007808PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809{
Walter Dörwald346737f2007-05-31 10:44:43 +00007810 if (PyUnicode_CheckExact(self)) {
7811 Py_INCREF(self);
7812 return self;
7813 } else
7814 /* Subtype -- return genuine unicode string with the same value. */
7815 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7816 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817}
7818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007819PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820"S.swapcase() -> unicode\n\
7821\n\
7822Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007823and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824
7825static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007826unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 return fixup(self, fixswapcase);
7829}
7830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007831PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832"S.translate(table) -> unicode\n\
7833\n\
7834Return a copy of the string S, where all characters have been mapped\n\
7835through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007836Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7837Unmapped characters are left untouched. Characters mapped to None\n\
7838are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839
7840static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007841unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842{
Georg Brandl94c2c752007-10-23 06:52:59 +00007843 PyObject *newtable = NULL;
7844 Py_ssize_t i = 0;
7845 PyObject *key, *value, *result;
7846
7847 if (!PyDict_Check(table)) {
7848 PyErr_SetString(PyExc_TypeError, "translate argument must be a dict");
7849 return NULL;
7850 }
7851 /* fixup the table -- allow size-1 string keys instead of only int keys */
7852 newtable = PyDict_Copy(table);
7853 if (!newtable) return NULL;
7854 while (PyDict_Next(table, &i, &key, &value)) {
7855 if (PyUnicode_Check(key)) {
7856 /* convert string keys to integer keys */
7857 PyObject *newkey;
7858 int res;
7859 if (PyUnicode_GET_SIZE(key) != 1) {
7860 PyErr_SetString(PyExc_ValueError, "string items in translate "
7861 "table must be 1 element long");
7862 goto err;
7863 }
7864 newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]);
7865 if (!newkey)
7866 goto err;
7867 res = PyDict_SetItem(newtable, newkey, value);
7868 Py_DECREF(newkey);
7869 if (res < 0)
7870 goto err;
7871 } else if (PyInt_Check(key)) {
7872 /* just keep integer keys */
7873 if (PyDict_SetItem(newtable, key, value) < 0)
7874 goto err;
7875 } else {
7876 PyErr_SetString(PyExc_TypeError, "items in translate table must be "
7877 "strings or integers");
7878 goto err;
7879 }
7880 }
7881
7882 result = PyUnicode_TranslateCharmap(self->str,
7883 self->length,
7884 newtable,
7885 "ignore");
7886 Py_DECREF(newtable);
7887 return result;
7888 err:
7889 Py_DECREF(newtable);
7890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891}
7892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007893PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894"S.upper() -> unicode\n\
7895\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007896Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897
7898static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007899unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 return fixup(self, fixupper);
7902}
7903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007904PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905"S.zfill(width) -> unicode\n\
7906\n\
7907Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007908of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909
7910static PyObject *
7911unicode_zfill(PyUnicodeObject *self, PyObject *args)
7912{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007913 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 PyUnicodeObject *u;
7915
Martin v. Löwis18e16552006-02-15 17:27:45 +00007916 Py_ssize_t width;
7917 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 return NULL;
7919
7920 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007921 if (PyUnicode_CheckExact(self)) {
7922 Py_INCREF(self);
7923 return (PyObject*) self;
7924 }
7925 else
7926 return PyUnicode_FromUnicode(
7927 PyUnicode_AS_UNICODE(self),
7928 PyUnicode_GET_SIZE(self)
7929 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 }
7931
7932 fill = width - self->length;
7933
7934 u = pad(self, fill, 0, '0');
7935
Walter Dörwald068325e2002-04-15 13:36:47 +00007936 if (u == NULL)
7937 return NULL;
7938
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939 if (u->str[fill] == '+' || u->str[fill] == '-') {
7940 /* move sign to beginning of string */
7941 u->str[0] = u->str[fill];
7942 u->str[fill] = '0';
7943 }
7944
7945 return (PyObject*) u;
7946}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947
#if 0
/* Debugging helper, compiled out: returns the current value of
 * unicode_freelist_size as a Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007956PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007957"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007959Return True if S starts with the specified prefix, False otherwise.\n\
7960With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007961With optional end, stop comparing S at that position.\n\
7962prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963
7964static PyObject *
7965unicode_startswith(PyUnicodeObject *self,
7966 PyObject *args)
7967{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007968 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007970 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007971 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007972 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007974 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007975 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007977 if (PyTuple_Check(subobj)) {
7978 Py_ssize_t i;
7979 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7980 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7981 PyTuple_GET_ITEM(subobj, i));
7982 if (substring == NULL)
7983 return NULL;
7984 result = tailmatch(self, substring, start, end, -1);
7985 Py_DECREF(substring);
7986 if (result) {
7987 Py_RETURN_TRUE;
7988 }
7989 }
7990 /* nothing matched */
7991 Py_RETURN_FALSE;
7992 }
7993 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007995 return NULL;
7996 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007998 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999}
8000
8001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008002PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008003"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008005Return True if S ends with the specified suffix, False otherwise.\n\
8006With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008007With optional end, stop comparing S at that position.\n\
8008suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009
8010static PyObject *
8011unicode_endswith(PyUnicodeObject *self,
8012 PyObject *args)
8013{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008014 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008016 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008017 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008018 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008020 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8021 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008023 if (PyTuple_Check(subobj)) {
8024 Py_ssize_t i;
8025 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8026 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8027 PyTuple_GET_ITEM(subobj, i));
8028 if (substring == NULL)
8029 return NULL;
8030 result = tailmatch(self, substring, start, end, +1);
8031 Py_DECREF(substring);
8032 if (result) {
8033 Py_RETURN_TRUE;
8034 }
8035 }
8036 Py_RETURN_FALSE;
8037 }
8038 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008040 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008042 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008044 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045}
8046
Eric Smith8c663262007-08-25 02:26:07 +00008047#include "stringlib/string_format.h"
8048
8049PyDoc_STRVAR(format__doc__,
8050"S.format(*args, **kwargs) -> unicode\n\
8051\n\
8052");
8053
Eric Smith8c663262007-08-25 02:26:07 +00008054PyDoc_STRVAR(p_format__doc__,
8055"S.__format__(format_spec) -> unicode\n\
8056\n\
8057");
8058
8059static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008060unicode_getnewargs(PyUnicodeObject *v)
8061{
8062 return Py_BuildValue("(u#)", v->str, v->length);
8063}
8064
8065
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066static PyMethodDef unicode_methods[] = {
8067
8068 /* Order is according to common usage: often used methods should
8069 appear first, since lookup is done sequentially. */
8070
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008071 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8072 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8073 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008074 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008075 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8076 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8077 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8078 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8079 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8080 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8081 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008082 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008083 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8084 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8085 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008086 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008087 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8088 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8089 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008090 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008091 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008092 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008093 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008094 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8095 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8096 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8097 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8098 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8099 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8100 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8101 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8102 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8103 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8104 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8105 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8106 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8107 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008108 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008109 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008110 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8111 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008112 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8113 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008114#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008115 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116#endif
8117
8118#if 0
8119 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008120 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121#endif
8122
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008123 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 {NULL, NULL}
8125};
8126
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008127static PyObject *
8128unicode_mod(PyObject *v, PyObject *w)
8129{
8130 if (!PyUnicode_Check(v)) {
8131 Py_INCREF(Py_NotImplemented);
8132 return Py_NotImplemented;
8133 }
8134 return PyUnicode_Format(v, w);
8135}
8136
8137static PyNumberMethods unicode_as_number = {
8138 0, /*nb_add*/
8139 0, /*nb_subtract*/
8140 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008141 unicode_mod, /*nb_remainder*/
8142};
8143
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008146 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008147 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8148 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008149 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 0, /* sq_ass_item */
8151 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008152 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153};
8154
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008155static PyObject*
8156unicode_subscript(PyUnicodeObject* self, PyObject* item)
8157{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008158 if (PyIndex_Check(item)) {
8159 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008160 if (i == -1 && PyErr_Occurred())
8161 return NULL;
8162 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008163 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008164 return unicode_getitem(self, i);
8165 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008166 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008167 Py_UNICODE* source_buf;
8168 Py_UNICODE* result_buf;
8169 PyObject* result;
8170
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008171 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008172 &start, &stop, &step, &slicelength) < 0) {
8173 return NULL;
8174 }
8175
8176 if (slicelength <= 0) {
8177 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008178 } else if (start == 0 && step == 1 && slicelength == self->length &&
8179 PyUnicode_CheckExact(self)) {
8180 Py_INCREF(self);
8181 return (PyObject *)self;
8182 } else if (step == 1) {
8183 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008184 } else {
8185 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008186 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8187 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008188
8189 if (result_buf == NULL)
8190 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008191
8192 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8193 result_buf[i] = source_buf[cur];
8194 }
Tim Petersced69f82003-09-16 20:30:58 +00008195
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008196 result = PyUnicode_FromUnicode(result_buf, slicelength);
8197 PyMem_FREE(result_buf);
8198 return result;
8199 }
8200 } else {
8201 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8202 return NULL;
8203 }
8204}
8205
8206static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008207 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008208 (binaryfunc)unicode_subscript, /* mp_subscript */
8209 (objobjargproc)0, /* mp_ass_subscript */
8210};
8211
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213/* Helpers for PyUnicode_Format() */
8214
8215static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008216getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008218 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 if (argidx < arglen) {
8220 (*p_argidx)++;
8221 if (arglen < 0)
8222 return args;
8223 else
8224 return PyTuple_GetItem(args, argidx);
8225 }
8226 PyErr_SetString(PyExc_TypeError,
8227 "not enough arguments for format string");
8228 return NULL;
8229}
8230
/* Bit flags passed as "flags" to the format helpers.  F_ALT makes
 * formatfloat()/formatint() put '#' into the C format they build; the
 * other names presumably mirror the printf '-', '+', ' ' and '0' flags
 * (TODO confirm against the format parser). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8236
Martin v. Löwis18e16552006-02-15 17:27:45 +00008237static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008238strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008240 register Py_ssize_t i;
8241 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 for (i = len - 1; i >= 0; i--)
8243 buffer[i] = (Py_UNICODE) charbuffer[i];
8244
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 return len;
8246}
8247
Neal Norwitzfc76d632006-01-10 06:03:13 +00008248static int
8249doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8250{
Tim Peters15231542006-02-16 01:08:01 +00008251 Py_ssize_t result;
8252
Neal Norwitzfc76d632006-01-10 06:03:13 +00008253 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008254 result = strtounicode(buffer, (char *)buffer);
8255 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008256}
8257
8258static int
8259longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8260{
Tim Peters15231542006-02-16 01:08:01 +00008261 Py_ssize_t result;
8262
Neal Norwitzfc76d632006-01-10 06:03:13 +00008263 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008264 result = strtounicode(buffer, (char *)buffer);
8265 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008266}
8267
Guido van Rossum078151d2002-08-11 04:24:12 +00008268/* XXX To save some code duplication, formatfloat/long/int could have been
8269 shared with stringobject.c, converting from 8-bit to Unicode after the
8270 formatting is done. */
8271
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272static int
8273formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008274 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 int flags,
8276 int prec,
8277 int type,
8278 PyObject *v)
8279{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008280 /* fmt = '%#.' + `prec` + `type`
8281 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 char fmt[20];
8283 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008284
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 x = PyFloat_AsDouble(v);
8286 if (x == -1.0 && PyErr_Occurred())
8287 return -1;
8288 if (prec < 0)
8289 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8291 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008292 /* Worst case length calc to ensure no buffer overrun:
8293
8294 'g' formats:
8295 fmt = %#.<prec>g
8296 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8297 for any double rep.)
8298 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8299
8300 'f' formats:
8301 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8302 len = 1 + 50 + 1 + prec = 52 + prec
8303
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008304 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008305 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008306
8307 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008308 if (((type == 'g' || type == 'G') &&
8309 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008310 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008311 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008312 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008313 return -1;
8314 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008315 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8316 (flags&F_ALT) ? "#" : "",
8317 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008318 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319}
8320
Tim Peters38fd5b62000-09-21 05:43:11 +00008321static PyObject*
8322formatlong(PyObject *val, int flags, int prec, int type)
8323{
8324 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008325 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008326 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008327 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008328
8329 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8330 if (!str)
8331 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008332 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008333 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008334 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008335}
8336
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337static int
8338formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008339 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 int flags,
8341 int prec,
8342 int type,
8343 PyObject *v)
8344{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008345 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008346 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8347 * + 1 + 1
8348 * = 24
8349 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008350 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008351 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 long x;
8353
8354 x = PyInt_AsLong(v);
8355 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008356 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008357 if (x < 0 && type == 'u') {
8358 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008359 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008360 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8361 sign = "-";
8362 else
8363 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008365 prec = 1;
8366
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008367 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8368 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008369 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008370 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008371 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008372 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008373 return -1;
8374 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008375
8376 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008377 (type == 'x' || type == 'X' || type == 'o')) {
8378 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008379 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008380 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008381 * - when 0 is being converted, the C standard leaves off
8382 * the '0x' or '0X', which is inconsistent with other
8383 * %#x/%#X conversions and inconsistent with Python's
8384 * hex() function
8385 * - there are platforms that violate the standard and
8386 * convert 0 with the '0x' or '0X'
8387 * (Metrowerks, Compaq Tru64)
8388 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008389 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008390 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008391 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008392 * We can achieve the desired consistency by inserting our
8393 * own '0x' or '0X' prefix, and substituting %x/%X in place
8394 * of %#x/%#X.
8395 *
8396 * Note that this is the same approach as used in
8397 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008398 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008399 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8400 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008401 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008402 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008403 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8404 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008405 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008406 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008407 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008408 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008409 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008410 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411}
8412
8413static int
8414formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008415 size_t buflen,
8416 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008418 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008419 if (PyUnicode_Check(v)) {
8420 if (PyUnicode_GET_SIZE(v) != 1)
8421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008425 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008426 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008427 goto onError;
8428 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430
8431 else {
8432 /* Integer input truncated to a character */
8433 long x;
8434 x = PyInt_AsLong(v);
8435 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008436 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008437#ifdef Py_UNICODE_WIDE
8438 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008439 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008440 "%c arg not in range(0x110000) "
8441 "(wide Python build)");
8442 return -1;
8443 }
8444#else
8445 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008446 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008447 "%c arg not in range(0x10000) "
8448 "(narrow Python build)");
8449 return -1;
8450 }
8451#endif
8452 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 }
8454 buf[1] = '\0';
8455 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008456
8457 onError:
8458 PyErr_SetString(PyExc_TypeError,
8459 "%c requires int or char");
8460 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461}
8462
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008463/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8464
8465 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8466 chars are formatted. XXX This is a magic number. Each formatting
8467 routine does bounds checking to ensure no overflow, but a better
8468 solution may be to malloc a buffer of appropriate size for each
8469 format. For now, the current solution is sufficient.
8470*/
8471#define FORMATBUFLEN (size_t)120
8472
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473PyObject *PyUnicode_Format(PyObject *format,
8474 PyObject *args)
8475{
8476 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008477 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 int args_owned = 0;
8479 PyUnicodeObject *result = NULL;
8480 PyObject *dict = NULL;
8481 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008482
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 if (format == NULL || args == NULL) {
8484 PyErr_BadInternalCall();
8485 return NULL;
8486 }
8487 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008488 if (uformat == NULL)
8489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 fmt = PyUnicode_AS_UNICODE(uformat);
8491 fmtcnt = PyUnicode_GET_SIZE(uformat);
8492
8493 reslen = rescnt = fmtcnt + 100;
8494 result = _PyUnicode_New(reslen);
8495 if (result == NULL)
8496 goto onError;
8497 res = PyUnicode_AS_UNICODE(result);
8498
8499 if (PyTuple_Check(args)) {
8500 arglen = PyTuple_Size(args);
8501 argidx = 0;
8502 }
8503 else {
8504 arglen = -1;
8505 argidx = -2;
8506 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008507 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008508 !PyString_Check(args) && !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509 dict = args;
8510
8511 while (--fmtcnt >= 0) {
8512 if (*fmt != '%') {
8513 if (--rescnt < 0) {
8514 rescnt = fmtcnt + 100;
8515 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008516 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008517 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8519 --rescnt;
8520 }
8521 *res++ = *fmt++;
8522 }
8523 else {
8524 /* Got a format specifier */
8525 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008526 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 Py_UNICODE c = '\0';
8529 Py_UNICODE fill;
8530 PyObject *v = NULL;
8531 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008532 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008534 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008535 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536
8537 fmt++;
8538 if (*fmt == '(') {
8539 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008540 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 PyObject *key;
8542 int pcount = 1;
8543
8544 if (dict == NULL) {
8545 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008546 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 goto onError;
8548 }
8549 ++fmt;
8550 --fmtcnt;
8551 keystart = fmt;
8552 /* Skip over balanced parentheses */
8553 while (pcount > 0 && --fmtcnt >= 0) {
8554 if (*fmt == ')')
8555 --pcount;
8556 else if (*fmt == '(')
8557 ++pcount;
8558 fmt++;
8559 }
8560 keylen = fmt - keystart - 1;
8561 if (fmtcnt < 0 || pcount > 0) {
8562 PyErr_SetString(PyExc_ValueError,
8563 "incomplete format key");
8564 goto onError;
8565 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008566#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008567 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 then looked up since Python uses strings to hold
8569 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008570 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 key = PyUnicode_EncodeUTF8(keystart,
8572 keylen,
8573 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008574#else
8575 key = PyUnicode_FromUnicode(keystart, keylen);
8576#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 if (key == NULL)
8578 goto onError;
8579 if (args_owned) {
8580 Py_DECREF(args);
8581 args_owned = 0;
8582 }
8583 args = PyObject_GetItem(dict, key);
8584 Py_DECREF(key);
8585 if (args == NULL) {
8586 goto onError;
8587 }
8588 args_owned = 1;
8589 arglen = -1;
8590 argidx = -2;
8591 }
8592 while (--fmtcnt >= 0) {
8593 switch (c = *fmt++) {
8594 case '-': flags |= F_LJUST; continue;
8595 case '+': flags |= F_SIGN; continue;
8596 case ' ': flags |= F_BLANK; continue;
8597 case '#': flags |= F_ALT; continue;
8598 case '0': flags |= F_ZERO; continue;
8599 }
8600 break;
8601 }
8602 if (c == '*') {
8603 v = getnextarg(args, arglen, &argidx);
8604 if (v == NULL)
8605 goto onError;
8606 if (!PyInt_Check(v)) {
8607 PyErr_SetString(PyExc_TypeError,
8608 "* wants int");
8609 goto onError;
8610 }
8611 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008612 if (width == -1 && PyErr_Occurred())
8613 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 if (width < 0) {
8615 flags |= F_LJUST;
8616 width = -width;
8617 }
8618 if (--fmtcnt >= 0)
8619 c = *fmt++;
8620 }
8621 else if (c >= '0' && c <= '9') {
8622 width = c - '0';
8623 while (--fmtcnt >= 0) {
8624 c = *fmt++;
8625 if (c < '0' || c > '9')
8626 break;
8627 if ((width*10) / 10 != width) {
8628 PyErr_SetString(PyExc_ValueError,
8629 "width too big");
8630 goto onError;
8631 }
8632 width = width*10 + (c - '0');
8633 }
8634 }
8635 if (c == '.') {
8636 prec = 0;
8637 if (--fmtcnt >= 0)
8638 c = *fmt++;
8639 if (c == '*') {
8640 v = getnextarg(args, arglen, &argidx);
8641 if (v == NULL)
8642 goto onError;
8643 if (!PyInt_Check(v)) {
8644 PyErr_SetString(PyExc_TypeError,
8645 "* wants int");
8646 goto onError;
8647 }
8648 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008649 if (prec == -1 && PyErr_Occurred())
8650 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 if (prec < 0)
8652 prec = 0;
8653 if (--fmtcnt >= 0)
8654 c = *fmt++;
8655 }
8656 else if (c >= '0' && c <= '9') {
8657 prec = c - '0';
8658 while (--fmtcnt >= 0) {
8659 c = Py_CHARMASK(*fmt++);
8660 if (c < '0' || c > '9')
8661 break;
8662 if ((prec*10) / 10 != prec) {
8663 PyErr_SetString(PyExc_ValueError,
8664 "prec too big");
8665 goto onError;
8666 }
8667 prec = prec*10 + (c - '0');
8668 }
8669 }
8670 } /* prec */
8671 if (fmtcnt >= 0) {
8672 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 if (--fmtcnt >= 0)
8674 c = *fmt++;
8675 }
8676 }
8677 if (fmtcnt < 0) {
8678 PyErr_SetString(PyExc_ValueError,
8679 "incomplete format");
8680 goto onError;
8681 }
8682 if (c != '%') {
8683 v = getnextarg(args, arglen, &argidx);
8684 if (v == NULL)
8685 goto onError;
8686 }
8687 sign = 0;
8688 fill = ' ';
8689 switch (c) {
8690
8691 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008692 pbuf = formatbuf;
8693 /* presume that buffer length is at least 1 */
8694 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 len = 1;
8696 break;
8697
8698 case 's':
8699 case 'r':
8700 if (PyUnicode_Check(v) && c == 's') {
8701 temp = v;
8702 Py_INCREF(temp);
8703 }
8704 else {
8705 PyObject *unicode;
8706 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008707 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 else
8709 temp = PyObject_Repr(v);
8710 if (temp == NULL)
8711 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008712 if (PyUnicode_Check(temp))
8713 /* nothing to do */;
8714 else if (PyString_Check(temp)) {
8715 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008716 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008718 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008720 Py_DECREF(temp);
8721 temp = unicode;
8722 if (temp == NULL)
8723 goto onError;
8724 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008725 else {
8726 Py_DECREF(temp);
8727 PyErr_SetString(PyExc_TypeError,
8728 "%s argument has non-string str()");
8729 goto onError;
8730 }
8731 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008732 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 len = PyUnicode_GET_SIZE(temp);
8734 if (prec >= 0 && len > prec)
8735 len = prec;
8736 break;
8737
8738 case 'i':
8739 case 'd':
8740 case 'u':
8741 case 'o':
8742 case 'x':
8743 case 'X':
8744 if (c == 'i')
8745 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008746 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008747 temp = formatlong(v, flags, prec, c);
8748 if (!temp)
8749 goto onError;
8750 pbuf = PyUnicode_AS_UNICODE(temp);
8751 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008752 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008754 else {
8755 pbuf = formatbuf;
8756 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8757 flags, prec, c, v);
8758 if (len < 0)
8759 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008760 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008761 }
8762 if (flags & F_ZERO)
8763 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 break;
8765
8766 case 'e':
8767 case 'E':
8768 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008769 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 case 'g':
8771 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008772 if (c == 'F')
8773 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008774 pbuf = formatbuf;
8775 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8776 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 if (len < 0)
8778 goto onError;
8779 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008780 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 fill = '0';
8782 break;
8783
8784 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008785 pbuf = formatbuf;
8786 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 if (len < 0)
8788 goto onError;
8789 break;
8790
8791 default:
8792 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008793 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008794 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008795 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008796 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008797 (Py_ssize_t)(fmt - 1 -
8798 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 goto onError;
8800 }
8801 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008802 if (*pbuf == '-' || *pbuf == '+') {
8803 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804 len--;
8805 }
8806 else if (flags & F_SIGN)
8807 sign = '+';
8808 else if (flags & F_BLANK)
8809 sign = ' ';
8810 else
8811 sign = 0;
8812 }
8813 if (width < len)
8814 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008815 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 reslen -= rescnt;
8817 rescnt = width + fmtcnt + 100;
8818 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008819 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008820 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008821 PyErr_NoMemory();
8822 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008823 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008824 if (_PyUnicode_Resize(&result, reslen) < 0) {
8825 Py_XDECREF(temp);
8826 goto onError;
8827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828 res = PyUnicode_AS_UNICODE(result)
8829 + reslen - rescnt;
8830 }
8831 if (sign) {
8832 if (fill != ' ')
8833 *res++ = sign;
8834 rescnt--;
8835 if (width > len)
8836 width--;
8837 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008838 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008839 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008840 assert(pbuf[1] == c);
8841 if (fill != ' ') {
8842 *res++ = *pbuf++;
8843 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008844 }
Tim Petersfff53252001-04-12 18:38:48 +00008845 rescnt -= 2;
8846 width -= 2;
8847 if (width < 0)
8848 width = 0;
8849 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 if (width > len && !(flags & F_LJUST)) {
8852 do {
8853 --rescnt;
8854 *res++ = fill;
8855 } while (--width > len);
8856 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008857 if (fill == ' ') {
8858 if (sign)
8859 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008860 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008861 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008862 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008863 *res++ = *pbuf++;
8864 *res++ = *pbuf++;
8865 }
8866 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008867 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 res += len;
8869 rescnt -= len;
8870 while (--width >= len) {
8871 --rescnt;
8872 *res++ = ' ';
8873 }
8874 if (dict && (argidx < arglen) && c != '%') {
8875 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008876 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008877 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 goto onError;
8879 }
8880 Py_XDECREF(temp);
8881 } /* '%' */
8882 } /* until end */
8883 if (argidx < arglen && !dict) {
8884 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008885 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 goto onError;
8887 }
8888
Thomas Woutersa96affe2006-03-12 00:29:36 +00008889 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8890 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891 if (args_owned) {
8892 Py_DECREF(args);
8893 }
8894 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 return (PyObject *)result;
8896
8897 onError:
8898 Py_XDECREF(result);
8899 Py_DECREF(uformat);
8900 if (args_owned) {
8901 Py_DECREF(args);
8902 }
8903 return NULL;
8904}
8905
Jeremy Hylton938ace62002-07-17 16:30:39 +00008906static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008907unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8908
Tim Peters6d6c1a32001-08-02 04:15:00 +00008909static PyObject *
8910unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8911{
8912 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008913 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008914 char *encoding = NULL;
8915 char *errors = NULL;
8916
Guido van Rossume023fe02001-08-30 03:12:59 +00008917 if (type != &PyUnicode_Type)
8918 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008919 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8920 kwlist, &x, &encoding, &errors))
8921 return NULL;
8922 if (x == NULL)
8923 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008924 if (encoding == NULL && errors == NULL)
8925 return PyObject_Unicode(x);
8926 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008927 return PyUnicode_FromEncodedObject(x, encoding, errors);
8928}
8929
Guido van Rossume023fe02001-08-30 03:12:59 +00008930static PyObject *
8931unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8932{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008933 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008934 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008935
8936 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8937 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8938 if (tmp == NULL)
8939 return NULL;
8940 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008941 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008942 if (pnew == NULL) {
8943 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008944 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008945 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008946 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8947 if (pnew->str == NULL) {
8948 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008949 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008950 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008951 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008952 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008953 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8954 pnew->length = n;
8955 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008956 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008957 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008958}
8959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008960PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008961"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008962\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008963Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008964encoding defaults to the current default string encoding.\n\
8965errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008966
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008967static PyObject *unicode_iter(PyObject *seq);
8968
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008970 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008971 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 sizeof(PyUnicodeObject), /* tp_size */
8973 0, /* tp_itemsize */
8974 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008975 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008977 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008979 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008980 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008981 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008983 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 (hashfunc) unicode_hash, /* tp_hash*/
8985 0, /* tp_call*/
8986 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008987 PyObject_GenericGetAttr, /* tp_getattro */
8988 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008989 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008990 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8991 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008992 unicode_doc, /* tp_doc */
8993 0, /* tp_traverse */
8994 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008995 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008996 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008997 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008998 0, /* tp_iternext */
8999 unicode_methods, /* tp_methods */
9000 0, /* tp_members */
9001 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009002 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009003 0, /* tp_dict */
9004 0, /* tp_descr_get */
9005 0, /* tp_descr_set */
9006 0, /* tp_dictoffset */
9007 0, /* tp_init */
9008 0, /* tp_alloc */
9009 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009010 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011};
9012
9013/* Initialize the Unicode implementation */
9014
Thomas Wouters78890102000-07-22 19:25:51 +00009015void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009017 int i;
9018
Thomas Wouters477c8d52006-05-27 19:21:47 +00009019 /* XXX - move this array to unicodectype.c ? */
9020 Py_UNICODE linebreak[] = {
9021 0x000A, /* LINE FEED */
9022 0x000D, /* CARRIAGE RETURN */
9023 0x001C, /* FILE SEPARATOR */
9024 0x001D, /* GROUP SEPARATOR */
9025 0x001E, /* RECORD SEPARATOR */
9026 0x0085, /* NEXT LINE */
9027 0x2028, /* LINE SEPARATOR */
9028 0x2029, /* PARAGRAPH SEPARATOR */
9029 };
9030
Fred Drakee4315f52000-05-09 19:53:39 +00009031 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009032 unicode_freelist = NULL;
9033 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009035 if (!unicode_empty)
9036 return;
9037
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009038 for (i = 0; i < 256; i++)
9039 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009040 if (PyType_Ready(&PyUnicode_Type) < 0)
9041 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009042
9043 /* initialize the linebreak bloom filter */
9044 bloom_linebreak = make_bloom_mask(
9045 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9046 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009047
9048 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049}
9050
9051/* Finalize the Unicode implementation */
9052
9053void
Thomas Wouters78890102000-07-22 19:25:51 +00009054_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009056 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009057 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009059 Py_XDECREF(unicode_empty);
9060 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009061
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009062 for (i = 0; i < 256; i++) {
9063 if (unicode_latin1[i]) {
9064 Py_DECREF(unicode_latin1[i]);
9065 unicode_latin1[i] = NULL;
9066 }
9067 }
9068
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009069 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070 PyUnicodeObject *v = u;
9071 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009072 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009073 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009074 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009075 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009077 unicode_freelist = NULL;
9078 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009080
Walter Dörwald16807132007-05-25 13:52:07 +00009081void
9082PyUnicode_InternInPlace(PyObject **p)
9083{
9084 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9085 PyObject *t;
9086 if (s == NULL || !PyUnicode_Check(s))
9087 Py_FatalError(
9088 "PyUnicode_InternInPlace: unicode strings only please!");
9089 /* If it's a subclass, we don't really know what putting
9090 it in the interned dict might do. */
9091 if (!PyUnicode_CheckExact(s))
9092 return;
9093 if (PyUnicode_CHECK_INTERNED(s))
9094 return;
9095 if (interned == NULL) {
9096 interned = PyDict_New();
9097 if (interned == NULL) {
9098 PyErr_Clear(); /* Don't leave an exception */
9099 return;
9100 }
9101 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009102 /* It might be that the GetItem call fails even
9103 though the key is present in the dictionary,
9104 namely when this happens during a stack overflow. */
9105 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009106 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009107 Py_END_ALLOW_RECURSION
9108
Walter Dörwald16807132007-05-25 13:52:07 +00009109 if (t) {
9110 Py_INCREF(t);
9111 Py_DECREF(*p);
9112 *p = t;
9113 return;
9114 }
9115
Martin v. Löwis5b222132007-06-10 09:51:05 +00009116 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009117 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9118 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009119 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009120 return;
9121 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009122 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009123 /* The two references in interned are not counted by refcnt.
9124 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009125 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009126 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9127}
9128
9129void
9130PyUnicode_InternImmortal(PyObject **p)
9131{
9132 PyUnicode_InternInPlace(p);
9133 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9134 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9135 Py_INCREF(*p);
9136 }
9137}
9138
9139PyObject *
9140PyUnicode_InternFromString(const char *cp)
9141{
9142 PyObject *s = PyUnicode_FromString(cp);
9143 if (s == NULL)
9144 return NULL;
9145 PyUnicode_InternInPlace(&s);
9146 return s;
9147}
9148
9149void _Py_ReleaseInternedUnicodeStrings(void)
9150{
9151 PyObject *keys;
9152 PyUnicodeObject *s;
9153 Py_ssize_t i, n;
9154 Py_ssize_t immortal_size = 0, mortal_size = 0;
9155
9156 if (interned == NULL || !PyDict_Check(interned))
9157 return;
9158 keys = PyDict_Keys(interned);
9159 if (keys == NULL || !PyList_Check(keys)) {
9160 PyErr_Clear();
9161 return;
9162 }
9163
9164 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9165 detector, interned unicode strings are not forcibly deallocated;
9166 rather, we give them their stolen references back, and then clear
9167 and DECREF the interned dict. */
9168
9169 n = PyList_GET_SIZE(keys);
9170 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9171 n);
9172 for (i = 0; i < n; i++) {
9173 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9174 switch (s->state) {
9175 case SSTATE_NOT_INTERNED:
9176 /* XXX Shouldn't happen */
9177 break;
9178 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009179 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009180 immortal_size += s->length;
9181 break;
9182 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009183 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009184 mortal_size += s->length;
9185 break;
9186 default:
9187 Py_FatalError("Inconsistent interned string state.");
9188 }
9189 s->state = SSTATE_NOT_INTERNED;
9190 }
9191 fprintf(stderr, "total size of all interned strings: "
9192 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9193 "mortal/immortal\n", mortal_size, immortal_size);
9194 Py_DECREF(keys);
9195 PyDict_Clear(interned);
9196 Py_DECREF(interned);
9197 interned = NULL;
9198}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009199
9200
9201/********************* Unicode Iterator **************************/
9202
9203typedef struct {
9204 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009205 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009206 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9207} unicodeiterobject;
9208
9209static void
9210unicodeiter_dealloc(unicodeiterobject *it)
9211{
9212 _PyObject_GC_UNTRACK(it);
9213 Py_XDECREF(it->it_seq);
9214 PyObject_GC_Del(it);
9215}
9216
9217static int
9218unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9219{
9220 Py_VISIT(it->it_seq);
9221 return 0;
9222}
9223
9224static PyObject *
9225unicodeiter_next(unicodeiterobject *it)
9226{
9227 PyUnicodeObject *seq;
9228 PyObject *item;
9229
9230 assert(it != NULL);
9231 seq = it->it_seq;
9232 if (seq == NULL)
9233 return NULL;
9234 assert(PyUnicode_Check(seq));
9235
9236 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009237 item = PyUnicode_FromUnicode(
9238 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009239 if (item != NULL)
9240 ++it->it_index;
9241 return item;
9242 }
9243
9244 Py_DECREF(seq);
9245 it->it_seq = NULL;
9246 return NULL;
9247}
9248
9249static PyObject *
9250unicodeiter_len(unicodeiterobject *it)
9251{
9252 Py_ssize_t len = 0;
9253 if (it->it_seq)
9254 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9255 return PyInt_FromSsize_t(len);
9256}
9257
9258PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9259
9260static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009261 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9262 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009263 {NULL, NULL} /* sentinel */
9264};
9265
9266PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009267 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009268 "unicodeiterator", /* tp_name */
9269 sizeof(unicodeiterobject), /* tp_basicsize */
9270 0, /* tp_itemsize */
9271 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009272 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009273 0, /* tp_print */
9274 0, /* tp_getattr */
9275 0, /* tp_setattr */
9276 0, /* tp_compare */
9277 0, /* tp_repr */
9278 0, /* tp_as_number */
9279 0, /* tp_as_sequence */
9280 0, /* tp_as_mapping */
9281 0, /* tp_hash */
9282 0, /* tp_call */
9283 0, /* tp_str */
9284 PyObject_GenericGetAttr, /* tp_getattro */
9285 0, /* tp_setattro */
9286 0, /* tp_as_buffer */
9287 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9288 0, /* tp_doc */
9289 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9290 0, /* tp_clear */
9291 0, /* tp_richcompare */
9292 0, /* tp_weaklistoffset */
9293 PyObject_SelfIter, /* tp_iter */
9294 (iternextfunc)unicodeiter_next, /* tp_iternext */
9295 unicodeiter_methods, /* tp_methods */
9296 0,
9297};
9298
9299static PyObject *
9300unicode_iter(PyObject *seq)
9301{
9302 unicodeiterobject *it;
9303
9304 if (!PyUnicode_Check(seq)) {
9305 PyErr_BadInternalCall();
9306 return NULL;
9307 }
9308 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9309 if (it == NULL)
9310 return NULL;
9311 it->it_index = 0;
9312 Py_INCREF(seq);
9313 it->it_seq = (PyUnicodeObject *)seq;
9314 _PyObject_GC_TRACK(it);
9315 return (PyObject *)it;
9316}
9317
Martin v. Löwis5b222132007-06-10 09:51:05 +00009318size_t
9319Py_UNICODE_strlen(const Py_UNICODE *u)
9320{
9321 int res = 0;
9322 while(*u++)
9323 res++;
9324 return res;
9325}
9326
9327Py_UNICODE*
9328Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9329{
9330 Py_UNICODE *u = s1;
9331 while ((*u++ = *s2++));
9332 return s1;
9333}
9334
9335Py_UNICODE*
9336Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9337{
9338 Py_UNICODE *u = s1;
9339 while ((*u++ = *s2++))
9340 if (n-- == 0)
9341 break;
9342 return s1;
9343}
9344
9345int
9346Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9347{
9348 while (*s1 && *s2 && *s1 == *s2)
9349 s1++, s2++;
9350 if (*s1 && *s2)
9351 return (*s1 < *s2) ? -1 : +1;
9352 if (*s1)
9353 return 1;
9354 if (*s2)
9355 return -1;
9356 return 0;
9357}
9358
9359Py_UNICODE*
9360Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9361{
9362 const Py_UNICODE *p;
9363 for (p = s; *p; p++)
9364 if (*p == c)
9365 return (Py_UNICODE*)p;
9366 return NULL;
9367}
9368
9369
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009370#ifdef __cplusplus
9371}
9372#endif
9373
9374
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009375/*
9376Local variables:
9377c-basic-offset: 4
9378indent-tabs-mode: nil
9379End:
9380*/