blob: c568a8e091d198fadd6d4fa1d27215e673709cc9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
57#define MAX_UNICODE_FREELIST_SIZE 1024
58
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Barry Warsaw51ac5802000-03-20 16:36:48 +000065 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
   Another way to look at this is to say that the actual reference
104 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000109static PyUnicodeObject *unicode_freelist;
110static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000129PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000130{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000131#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000132 return 0x10FFFF;
133#else
134 /* This is actually an illegal character, so it should
135 not be passed to unichr. */
136 return 0xFFFF;
137#endif
138}
139
Thomas Wouters477c8d52006-05-27 19:21:47 +0000140/* --- Bloom Filters ----------------------------------------------------- */
141
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
145
146/* the linebreak mask is set up by Unicode_Init below */
147
/* Integer type of the bitmask used by the bloom filters. */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters (filled in during Unicode
   initialization). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 by 31
   moves a bit into the sign position, which is undefined behaviour, and
   in practice sign-extends into the high bits of a wider mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True when ch is a line break.  The cheap bloom test comes first so that
   Py_UNICODE_ISLINEBREAK is only evaluated for characters whose bit is
   set in bloom_linebreak. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
156
157Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
158{
159 /* calculate simple bloom-style bitmask for a given unicode string */
160
161 long mask;
162 Py_ssize_t i;
163
164 mask = 0;
165 for (i = 0; i < len; i++)
166 mask |= (1 << (ptr[i] & 0x1F));
167
168 return mask;
169}
170
171Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
172{
173 Py_ssize_t i;
174
175 for (i = 0; i < setlen; i++)
176 if (set[i] == chr)
177 return 1;
178
179 return 0;
180}
181
/* True when chr is one of the setlen characters in set.  The bloom mask
   is tested first, so the linear search in unicode_member only runs for
   characters whose bit is set in mask.
   The expansion is fully parenthesized so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
184
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185/* --- Unicode Object ----------------------------------------------------- */
186
187static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000189 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190{
191 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000192
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000193 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 /* Resizing shared object (unicode_empty or single character
198 objects) in-place is not allowed. Use PyUnicode_Resize()
199 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 if (unicode == unicode_empty ||
202 (unicode->length == 1 &&
203 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 return -1;
208 }
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210 /* We allocate one more byte to make sure the string is Ux0000 terminated.
211 The overallocation is also used by fastsearch, which assumes that it's
212 safe to look at str[length] (without making any assumptions about what
213 it contains). */
214
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 oldstr = unicode->str;
216 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
217 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 PyErr_NoMemory();
220 return -1;
221 }
222 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000223 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000225 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000227 if (unicode->defenc) {
228 Py_DECREF(unicode->defenc);
229 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 }
231 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 return 0;
234}
235
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239
240 XXX This allocator could further be enhanced by assuring that the
241 free list never reduces its size below 1.
242
243*/
244
245static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 register PyUnicodeObject *unicode;
249
Thomas Wouters477c8d52006-05-27 19:21:47 +0000250 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (length == 0 && unicode_empty != NULL) {
252 Py_INCREF(unicode_empty);
253 return unicode_empty;
254 }
255
256 /* Unicode freelist & memory allocation */
257 if (unicode_freelist) {
258 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000259 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Keep-Alive optimization: we only upsize the buffer,
263 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000264 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000265 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000266 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 }
269 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000270 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000272 }
273 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000276 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (unicode == NULL)
278 return NULL;
279 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
280 }
281
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000282 if (!unicode->str) {
283 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000284 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000285 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000287 * the caller fails before initializing str -- unicode_resize()
288 * reads str[0], and the Keep-Alive optimization can keep memory
289 * allocated for str alive across a call to unicode_dealloc(unicode).
290 * We don't want unicode_resize to read uninitialized memory in
291 * that case.
292 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000293 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000297 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000298 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000300
301 onError:
302 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000303 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305}
306
307static
Guido van Rossum9475a232001-10-05 20:51:39 +0000308void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
Walter Dörwald16807132007-05-25 13:52:07 +0000310 switch (PyUnicode_CHECK_INTERNED(unicode)) {
311 case SSTATE_NOT_INTERNED:
312 break;
313
314 case SSTATE_INTERNED_MORTAL:
315 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000316 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000317 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
318 Py_FatalError(
319 "deletion of interned unicode string failed");
320 break;
321
322 case SSTATE_INTERNED_IMMORTAL:
323 Py_FatalError("Immortal interned unicode string died.");
324
325 default:
326 Py_FatalError("Inconsistent interned unicode string state.");
327 }
328
Guido van Rossum604ddf82001-12-06 20:03:56 +0000329 if (PyUnicode_CheckExact(unicode) &&
330 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000331 /* Keep-Alive optimization */
332 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000333 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 unicode->str = NULL;
335 unicode->length = 0;
336 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000337 if (unicode->defenc) {
338 Py_DECREF(unicode->defenc);
339 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000340 }
341 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 *(PyUnicodeObject **)unicode = unicode_freelist;
343 unicode_freelist = unicode;
344 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000347 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000349 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 }
351}
352
Martin v. Löwis18e16552006-02-15 17:27:45 +0000353int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000354{
355 register PyUnicodeObject *v;
356
357 /* Argument checks */
358 if (unicode == NULL) {
359 PyErr_BadInternalCall();
360 return -1;
361 }
362 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000363 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 PyErr_BadInternalCall();
365 return -1;
366 }
367
368 /* Resizing unicode_empty and single character objects is not
369 possible since these are being shared. We simply return a fresh
370 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000371 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 (v == unicode_empty || v->length == 1)) {
373 PyUnicodeObject *w = _PyUnicode_New(length);
374 if (w == NULL)
375 return -1;
376 Py_UNICODE_COPY(w->str, v->str,
377 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000378 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 *unicode = (PyObject *)w;
380 return 0;
381 }
382
383 /* Note that we don't have to modify *unicode for unshared Unicode
384 objects, since we can modify them in-place. */
385 return unicode_resize(v, length);
386}
387
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of the object variable to PyObject **.  Both arguments are
   parenthesized so that any expression can be passed safely. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000393 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 PyUnicodeObject *unicode;
396
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000397 /* If the Unicode data is known at construction time, we can apply
398 some optimizations which share commonly used objects. */
399 if (u != NULL) {
400
401 /* Optimization for empty strings */
402 if (size == 0 && unicode_empty != NULL) {
403 Py_INCREF(unicode_empty);
404 return (PyObject *)unicode_empty;
405 }
406
407 /* Single character Unicode objects in the Latin-1 range are
408 shared when using this constructor */
409 if (size == 1 && *u < 256) {
410 unicode = unicode_latin1[*u];
411 if (!unicode) {
412 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413 if (!unicode)
414 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000415 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 unicode_latin1[*u] = unicode;
417 }
418 Py_INCREF(unicode);
419 return (PyObject *)unicode;
420 }
421 }
Tim Petersced69f82003-09-16 20:30:58 +0000422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 unicode = _PyUnicode_New(size);
424 if (!unicode)
425 return NULL;
426
427 /* Copy the Unicode data into the new object */
428 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430
431 return (PyObject *)unicode;
432}
433
Walter Dörwaldd2034312007-05-18 16:29:38 +0000434PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000435{
436 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000437 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000438 some optimizations which share commonly used objects.
439 Also, this means the input must be UTF-8, so fall back to the
440 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000441 if (u != NULL) {
442
443 /* Optimization for empty strings */
444 if (size == 0 && unicode_empty != NULL) {
445 Py_INCREF(unicode_empty);
446 return (PyObject *)unicode_empty;
447 }
448
Martin v. Löwis9c121062007-08-05 20:26:11 +0000449 /* Single characters are shared when using this constructor.
450 Restrict to ASCII, since the input must be UTF-8. */
451 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000453 if (!unicode) {
454 unicode = _PyUnicode_New(1);
455 if (!unicode)
456 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000457 unicode->str[0] = Py_CHARMASK(*u);
458 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000459 }
460 Py_INCREF(unicode);
461 return (PyObject *)unicode;
462 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000463
464 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
Walter Dörwald55507312007-05-18 13:12:10 +0000467 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000468 if (!unicode)
469 return NULL;
470
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000471 return (PyObject *)unicode;
472}
473
Walter Dörwaldd2034312007-05-18 16:29:38 +0000474PyObject *PyUnicode_FromString(const char *u)
475{
476 size_t size = strlen(u);
477 if (size > PY_SSIZE_T_MAX) {
478 PyErr_SetString(PyExc_OverflowError, "input too long");
479 return NULL;
480 }
481
482 return PyUnicode_FromStringAndSize(u, size);
483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485#ifdef HAVE_WCHAR_H
486
487PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000488 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489{
490 PyUnicodeObject *unicode;
491
492 if (w == NULL) {
493 PyErr_BadInternalCall();
494 return NULL;
495 }
496
497 unicode = _PyUnicode_New(size);
498 if (!unicode)
499 return NULL;
500
501 /* Copy the wchar_t data into the new object */
502#ifdef HAVE_USABLE_WCHAR_T
503 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000504#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 {
506 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000509 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 *u++ = *w++;
511 }
512#endif
513
514 return (PyObject *)unicode;
515}
516
Walter Dörwald346737f2007-05-31 10:44:43 +0000517static void
518makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
519{
520 *fmt++ = '%';
521 if (width) {
522 if (zeropad)
523 *fmt++ = '0';
524 fmt += sprintf(fmt, "%d", width);
525 }
526 if (precision)
527 fmt += sprintf(fmt, ".%d", precision);
528 if (longflag)
529 *fmt++ = 'l';
530 else if (size_tflag) {
531 char *f = PY_FORMAT_SIZE_T;
532 while (*f)
533 *fmt++ = *f++;
534 }
535 *fmt++ = c;
536 *fmt = '\0';
537}
538
/* Append the NUL-terminated char string `string` to the output, one
   character per element.  Relies on two variables at the expansion site:
   `copy` (a const char * scratch pointer) and `s` (the output cursor,
   which is advanced past the appended characters).
   Wrapped in do { } while (0) so that "appendstring(x);" is a single
   statement and stays correct inside an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
540
541PyObject *
542PyUnicode_FromFormatV(const char *format, va_list vargs)
543{
544 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000545 Py_ssize_t callcount = 0;
546 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000547 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 int width = 0;
550 int precision = 0;
551 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000552 const char* f;
553 Py_UNICODE *s;
554 PyObject *string;
555 /* used by sprintf */
556 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000557 /* use abuffer instead of buffer, if we need more space
558 * (which can happen if there's a format specifier with width). */
559 char *abuffer = NULL;
560 char *realbuffer;
561 Py_ssize_t abuffersize = 0;
562 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000563 const char *copy;
564
565#ifdef VA_LIST_IS_ARRAY
566 Py_MEMCPY(count, vargs, sizeof(va_list));
567#else
568#ifdef __va_copy
569 __va_copy(count, vargs);
570#else
571 count = vargs;
572#endif
573#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 1: count the number of %S/%R format specifications
575 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
576 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000577 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000578 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 ++callcount;
580 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000581 /* step 2: allocate memory for the results of
582 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000583 if (callcount) {
584 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
585 if (!callresults) {
586 PyErr_NoMemory();
587 return NULL;
588 }
589 callresult = callresults;
590 }
591 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000592 for (f = format; *f; f++) {
593 if (*f == '%') {
594 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000595 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000596 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000597 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000598 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000599 ;
600
601 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
602 * they don't affect the amount of space we reserve.
603 */
604 if ((*f == 'l' || *f == 'z') &&
605 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000606 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607
608 switch (*f) {
609 case 'c':
610 (void)va_arg(count, int);
611 /* fall through... */
612 case '%':
613 n++;
614 break;
615 case 'd': case 'u': case 'i': case 'x':
616 (void) va_arg(count, int);
617 /* 20 bytes is enough to hold a 64-bit
618 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000619 This isn't enough for octal.
620 If a width is specified we need more
621 (which we allocate later). */
622 if (width < 20)
623 width = 20;
624 n += width;
625 if (abuffersize < width)
626 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000627 break;
628 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000629 {
630 /* UTF-8 */
631 unsigned char*s;
632 s = va_arg(count, unsigned char*);
633 while (*s) {
634 if (*s < 128) {
635 n++; s++;
636 } else if (*s < 0xc0) {
637 /* invalid UTF-8 */
638 n++; s++;
639 } else if (*s < 0xc0) {
640 n++;
641 s++; if(!*s)break;
642 s++;
643 } else if (*s < 0xe0) {
644 n++;
645 s++; if(!*s)break;
646 s++; if(!*s)break;
647 s++;
648 } else {
649 #ifdef Py_UNICODE_WIDE
650 n++;
651 #else
652 n+=2;
653 #endif
654 s++; if(!*s)break;
655 s++; if(!*s)break;
656 s++; if(!*s)break;
657 s++;
658 }
659 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000660 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000661 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 case 'U':
663 {
664 PyObject *obj = va_arg(count, PyObject *);
665 assert(obj && PyUnicode_Check(obj));
666 n += PyUnicode_GET_SIZE(obj);
667 break;
668 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000669 case 'V':
670 {
671 PyObject *obj = va_arg(count, PyObject *);
672 const char *str = va_arg(count, const char *);
673 assert(obj || str);
674 assert(!obj || PyUnicode_Check(obj));
675 if (obj)
676 n += PyUnicode_GET_SIZE(obj);
677 else
678 n += strlen(str);
679 break;
680 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000681 case 'S':
682 {
683 PyObject *obj = va_arg(count, PyObject *);
684 PyObject *str;
685 assert(obj);
686 str = PyObject_Unicode(obj);
687 if (!str)
688 goto fail;
689 n += PyUnicode_GET_SIZE(str);
690 /* Remember the str and switch to the next slot */
691 *callresult++ = str;
692 break;
693 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000694 case 'R':
695 {
696 PyObject *obj = va_arg(count, PyObject *);
697 PyObject *repr;
698 assert(obj);
699 repr = PyObject_Repr(obj);
700 if (!repr)
701 goto fail;
702 n += PyUnicode_GET_SIZE(repr);
703 /* Remember the repr and switch to the next slot */
704 *callresult++ = repr;
705 break;
706 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000707 case 'p':
708 (void) va_arg(count, int);
709 /* maximum 64-bit pointer representation:
710 * 0xffffffffffffffff
711 * so 19 characters is enough.
712 * XXX I count 18 -- what's the extra for?
713 */
714 n += 19;
715 break;
716 default:
717 /* if we stumble upon an unknown
718 formatting code, copy the rest of
719 the format string to the output
720 string. (we cannot just skip the
721 code, since there's no way to know
722 what's in the argument list) */
723 n += strlen(p);
724 goto expand;
725 }
726 } else
727 n++;
728 }
729 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 if (abuffersize > 20) {
731 abuffer = PyMem_Malloc(abuffersize);
732 if (!abuffer) {
733 PyErr_NoMemory();
734 goto fail;
735 }
736 realbuffer = abuffer;
737 }
738 else
739 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000740 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000741 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000742 we don't have to resize the string.
743 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 string = PyUnicode_FromUnicode(NULL, n);
745 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747
748 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000749 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750
751 for (f = format; *f; f++) {
752 if (*f == '%') {
753 const char* p = f++;
754 int longflag = 0;
755 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 zeropad = (*f == '0');
757 /* parse the width.precision part */
758 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000759 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 width = (width*10) + *f++ - '0';
761 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 if (*f == '.') {
763 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000764 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 /* handle the long flag, but only for %ld and %lu.
768 others can be added when necessary. */
769 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
770 longflag = 1;
771 ++f;
772 }
773 /* handle the size_t flag. */
774 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
775 size_tflag = 1;
776 ++f;
777 }
778
779 switch (*f) {
780 case 'c':
781 *s++ = va_arg(vargs, int);
782 break;
783 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000784 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000786 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000788 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000789 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000790 sprintf(realbuffer, fmt, va_arg(vargs, int));
791 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 break;
793 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000794 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000795 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000796 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000798 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000800 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
801 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 break;
803 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
805 sprintf(realbuffer, fmt, va_arg(vargs, int));
806 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 break;
808 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000809 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
810 sprintf(realbuffer, fmt, va_arg(vargs, int));
811 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000812 break;
813 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000814 {
815 /* Parameter must be UTF-8 encoded.
816 In case of encoding errors, use
817 the replacement character. */
818 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000820 u = PyUnicode_DecodeUTF8(p, strlen(p),
821 "replace");
822 if (!u)
823 goto fail;
824 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
825 PyUnicode_GET_SIZE(u));
826 s += PyUnicode_GET_SIZE(u);
827 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000829 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000830 case 'U':
831 {
832 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000833 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
834 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
835 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 break;
837 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000838 case 'V':
839 {
840 PyObject *obj = va_arg(vargs, PyObject *);
841 const char *str = va_arg(vargs, const char *);
842 if (obj) {
843 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
844 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
845 s += size;
846 } else {
847 appendstring(str);
848 }
849 break;
850 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000851 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 case 'R':
853 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000854 Py_UNICODE *ucopy;
855 Py_ssize_t usize;
856 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000857 /* unused, since we already have the result */
858 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000859 ucopy = PyUnicode_AS_UNICODE(*callresult);
860 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 for (upos = 0; upos<usize;)
862 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000863 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000864 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000865 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000866 ++callresult;
867 break;
868 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000869 case 'p':
870 sprintf(buffer, "%p", va_arg(vargs, void*));
871 /* %p is ill-defined: ensure leading 0x. */
872 if (buffer[1] == 'X')
873 buffer[1] = 'x';
874 else if (buffer[1] != 'x') {
875 memmove(buffer+2, buffer, strlen(buffer)+1);
876 buffer[0] = '0';
877 buffer[1] = 'x';
878 }
879 appendstring(buffer);
880 break;
881 case '%':
882 *s++ = '%';
883 break;
884 default:
885 appendstring(p);
886 goto end;
887 }
888 } else
889 *s++ = *f;
890 }
891
892 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000893 if (callresults)
894 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 if (abuffer)
896 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
898 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000899 fail:
900 if (callresults) {
901 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000902 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000903 Py_DECREF(*callresult2);
904 ++callresult2;
905 }
906 PyMem_Free(callresults);
907 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000908 if (abuffer)
909 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000910 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000911}
912
913#undef appendstring
914
915PyObject *
916PyUnicode_FromFormat(const char *format, ...)
917{
918 PyObject* ret;
919 va_list vargs;
920
921#ifdef HAVE_STDARG_PROTOTYPES
922 va_start(vargs, format);
923#else
924 va_start(vargs);
925#endif
926 ret = PyUnicode_FromFormatV(format, vargs);
927 va_end(vargs);
928 return ret;
929}
930
Martin v. Löwis18e16552006-02-15 17:27:45 +0000931Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
932 wchar_t *w,
933 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934{
935 if (unicode == NULL) {
936 PyErr_BadInternalCall();
937 return -1;
938 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000939
940 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000942 size = PyUnicode_GET_SIZE(unicode) + 1;
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944#ifdef HAVE_USABLE_WCHAR_T
945 memcpy(w, unicode->str, size * sizeof(wchar_t));
946#else
947 {
948 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000949 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000951 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 *w++ = *u++;
953 }
954#endif
955
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000956 if (size > PyUnicode_GET_SIZE(unicode))
957 return PyUnicode_GET_SIZE(unicode);
958 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000959 return size;
960}
961
962#endif
963
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000964PyObject *PyUnicode_FromOrdinal(int ordinal)
965{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000966 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000967
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000968 if (ordinal < 0 || ordinal > 0x10ffff) {
969 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000970 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000971 return NULL;
972 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000973
974#ifndef Py_UNICODE_WIDE
975 if (ordinal > 0xffff) {
976 ordinal -= 0x10000;
977 s[0] = 0xD800 | (ordinal >> 10);
978 s[1] = 0xDC00 | (ordinal & 0x3FF);
979 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000980 }
981#endif
982
Hye-Shik Chang40574832004-04-06 07:24:51 +0000983 s[0] = (Py_UNICODE)ordinal;
984 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000985}
986
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987PyObject *PyUnicode_FromObject(register PyObject *obj)
988{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989 /* XXX Perhaps we should make this API an alias of
990 PyObject_Unicode() instead ?! */
991 if (PyUnicode_CheckExact(obj)) {
992 Py_INCREF(obj);
993 return obj;
994 }
995 if (PyUnicode_Check(obj)) {
996 /* For a Unicode subtype that's not a Unicode object,
997 return a true Unicode object with the same data. */
998 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
999 PyUnicode_GET_SIZE(obj));
1000 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001001 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1002}
1003
1004PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1005 const char *encoding,
1006 const char *errors)
1007{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001008 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001009 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001011
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 if (obj == NULL) {
1013 PyErr_BadInternalCall();
1014 return NULL;
1015 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017 if (PyUnicode_Check(obj)) {
1018 PyErr_SetString(PyExc_TypeError,
1019 "decoding Unicode is not supported");
1020 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001021 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001022
1023 /* Coerce object */
1024 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001025 s = PyString_AS_STRING(obj);
1026 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001027 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001028 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1029 /* Overwrite the error message with something more useful in
1030 case of a TypeError. */
1031 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001032 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001033 "coercing to Unicode: need string or buffer, "
1034 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001035 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001036 goto onError;
1037 }
Tim Petersced69f82003-09-16 20:30:58 +00001038
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001039 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 if (len == 0) {
1041 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 }
Tim Petersced69f82003-09-16 20:30:58 +00001044 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001046
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001047 return v;
1048
1049 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051}
1052
1053PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001054 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 const char *encoding,
1056 const char *errors)
1057{
1058 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001059 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001060 char lower[20]; /* Enough for any encoding name we recognize */
1061 char *l;
1062 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001063
1064 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001065 encoding = PyUnicode_GetDefaultEncoding();
1066
1067 /* Convert encoding to lower case and replace '_' with '-' in order to
1068 catch e.g. UTF_8 */
1069 e = encoding;
1070 l = lower;
1071 while (*e && l < &lower[(sizeof lower) - 2]) {
1072 if (ISUPPER(*e)) {
1073 *l++ = TOLOWER(*e++);
1074 }
1075 else if (*e == '_') {
1076 *l++ = '-';
1077 e++;
1078 }
1079 else {
1080 *l++ = *e++;
1081 }
1082 }
1083 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001084
1085 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001086 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001088 else if ((strcmp(lower, "latin-1") == 0) ||
1089 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001090 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001091#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001092 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001093 return PyUnicode_DecodeMBCS(s, size, errors);
1094#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001095 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001096 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001097 else if (strcmp(lower, "utf-16") == 0)
1098 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1099 else if (strcmp(lower, "utf-32") == 0)
1100 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101
1102 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001103 buffer = NULL;
1104 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1105 goto onError;
1106 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107 if (buffer == NULL)
1108 goto onError;
1109 unicode = PyCodec_Decode(buffer, encoding, errors);
1110 if (unicode == NULL)
1111 goto onError;
1112 if (!PyUnicode_Check(unicode)) {
1113 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001114 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001115 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 Py_DECREF(unicode);
1117 goto onError;
1118 }
1119 Py_DECREF(buffer);
1120 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001121
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 onError:
1123 Py_XDECREF(buffer);
1124 return NULL;
1125}
1126
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001127PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1128 const char *encoding,
1129 const char *errors)
1130{
1131 PyObject *v;
1132
1133 if (!PyUnicode_Check(unicode)) {
1134 PyErr_BadArgument();
1135 goto onError;
1136 }
1137
1138 if (encoding == NULL)
1139 encoding = PyUnicode_GetDefaultEncoding();
1140
1141 /* Decode via the codec registry */
1142 v = PyCodec_Decode(unicode, encoding, errors);
1143 if (v == NULL)
1144 goto onError;
1145 return v;
1146
1147 onError:
1148 return NULL;
1149}
1150
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001152 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 const char *encoding,
1154 const char *errors)
1155{
1156 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001157
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 unicode = PyUnicode_FromUnicode(s, size);
1159 if (unicode == NULL)
1160 return NULL;
1161 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1162 Py_DECREF(unicode);
1163 return v;
1164}
1165
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001166PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1167 const char *encoding,
1168 const char *errors)
1169{
1170 PyObject *v;
1171
1172 if (!PyUnicode_Check(unicode)) {
1173 PyErr_BadArgument();
1174 goto onError;
1175 }
1176
1177 if (encoding == NULL)
1178 encoding = PyUnicode_GetDefaultEncoding();
1179
1180 /* Encode via the codec registry */
1181 v = PyCodec_Encode(unicode, encoding, errors);
1182 if (v == NULL)
1183 goto onError;
1184 return v;
1185
1186 onError:
1187 return NULL;
1188}
1189
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1191 const char *encoding,
1192 const char *errors)
1193{
1194 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001195
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_BadArgument();
1198 goto onError;
1199 }
Fred Drakee4315f52000-05-09 19:53:39 +00001200
Tim Petersced69f82003-09-16 20:30:58 +00001201 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001202 encoding = PyUnicode_GetDefaultEncoding();
1203
1204 /* Shortcuts for common default encodings */
1205 if (errors == NULL) {
1206 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001207 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001208 else if (strcmp(encoding, "latin-1") == 0)
1209 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001210#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1211 else if (strcmp(encoding, "mbcs") == 0)
1212 return PyUnicode_AsMBCSString(unicode);
1213#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001214 else if (strcmp(encoding, "ascii") == 0)
1215 return PyUnicode_AsASCIIString(unicode);
1216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218 /* Encode via the codec registry */
1219 v = PyCodec_Encode(unicode, encoding, errors);
1220 if (v == NULL)
1221 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001222 if (!PyBytes_Check(v)) {
1223 if (PyString_Check(v)) {
1224 /* Old codec, turn it into bytes */
1225 PyObject *b = PyBytes_FromObject(v);
1226 Py_DECREF(v);
1227 return b;
1228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001230 "encoder did not return a bytes object "
1231 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1232 v->ob_type->tp_name,
1233 encoding ? encoding : "NULL",
1234 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 Py_DECREF(v);
1236 goto onError;
1237 }
1238 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001239
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 onError:
1241 return NULL;
1242}
1243
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001244PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1245 const char *errors)
1246{
1247 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001248 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001249 if (v)
1250 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001251 if (errors != NULL)
1252 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001253 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1254 PyUnicode_GET_SIZE(unicode),
1255 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001256 if (!b)
1257 return NULL;
1258 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1259 PyBytes_Size(b));
1260 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001261 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001262 return v;
1263}
1264
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001265PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001266PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001267 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001268 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1269}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001270
Christian Heimes5894ba72007-11-04 11:43:14 +00001271PyObject*
1272PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1273{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001274 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1275 can be undefined. If it is case, decode using UTF-8. The following assumes
1276 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1277 bootstrapping process where the codecs aren't ready yet.
1278 */
1279 if (Py_FileSystemDefaultEncoding) {
1280#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001281 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001282 return PyUnicode_DecodeMBCS(s, size, "replace");
1283 }
1284#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001285 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001286 return PyUnicode_DecodeUTF8(s, size, "replace");
1287 }
1288#endif
1289 return PyUnicode_Decode(s, size,
1290 Py_FileSystemDefaultEncoding,
1291 "replace");
1292 }
1293 else {
1294 return PyUnicode_DecodeUTF8(s, size, "replace");
1295 }
1296}
1297
Martin v. Löwis5b222132007-06-10 09:51:05 +00001298char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001299PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001300{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001301 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001302 if (!PyUnicode_Check(unicode)) {
1303 PyErr_BadArgument();
1304 return NULL;
1305 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001306 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1307 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001308 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001309 if (psize != NULL)
1310 *psize = PyString_GET_SIZE(str8);
1311 return PyString_AS_STRING(str8);
1312}
1313
1314char*
1315PyUnicode_AsString(PyObject *unicode)
1316{
1317 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001318}
1319
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1321{
1322 if (!PyUnicode_Check(unicode)) {
1323 PyErr_BadArgument();
1324 goto onError;
1325 }
1326 return PyUnicode_AS_UNICODE(unicode);
1327
1328 onError:
1329 return NULL;
1330}
1331
Martin v. Löwis18e16552006-02-15 17:27:45 +00001332Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333{
1334 if (!PyUnicode_Check(unicode)) {
1335 PyErr_BadArgument();
1336 goto onError;
1337 }
1338 return PyUnicode_GET_SIZE(unicode);
1339
1340 onError:
1341 return -1;
1342}
1343
Thomas Wouters78890102000-07-22 19:25:51 +00001344const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001345{
1346 return unicode_default_encoding;
1347}
1348
1349int PyUnicode_SetDefaultEncoding(const char *encoding)
1350{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001351 if (strcmp(encoding, unicode_default_encoding) != 0) {
1352 PyErr_Format(PyExc_ValueError,
1353 "Can only set default encoding to %s",
1354 unicode_default_encoding);
1355 return -1;
1356 }
Fred Drakee4315f52000-05-09 19:53:39 +00001357 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001358}
1359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001360/* error handling callback helper:
1361 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001362 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363 and adjust various state variables.
1364 return 0 on success, -1 on error
1365*/
1366
1367static
1368int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1369 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001370 const char **input, const char **inend, Py_ssize_t *startinpos,
1371 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001372 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001373{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001374 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375
1376 PyObject *restuple = NULL;
1377 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001378 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001379 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001380 Py_ssize_t requiredsize;
1381 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001382 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001383 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001384 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 int res = -1;
1386
1387 if (*errorHandler == NULL) {
1388 *errorHandler = PyCodec_LookupError(errors);
1389 if (*errorHandler == NULL)
1390 goto onError;
1391 }
1392
1393 if (*exceptionObject == NULL) {
1394 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001395 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 if (*exceptionObject == NULL)
1397 goto onError;
1398 }
1399 else {
1400 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1401 goto onError;
1402 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1403 goto onError;
1404 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1405 goto onError;
1406 }
1407
1408 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1409 if (restuple == NULL)
1410 goto onError;
1411 if (!PyTuple_Check(restuple)) {
1412 PyErr_Format(PyExc_TypeError, &argparse[4]);
1413 goto onError;
1414 }
1415 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1416 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001417
1418 /* Copy back the bytes variables, which might have been modified by the
1419 callback */
1420 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1421 if (!inputobj)
1422 goto onError;
1423 if (!PyBytes_Check(inputobj)) {
1424 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1425 }
1426 *input = PyBytes_AS_STRING(inputobj);
1427 insize = PyBytes_GET_SIZE(inputobj);
1428 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001429 /* we can DECREF safely, as the exception has another reference,
1430 so the object won't go away. */
1431 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001434 newpos = insize+newpos;
1435 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001436 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001437 goto onError;
1438 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001439
1440 /* need more space? (at least enough for what we
1441 have+the replacement+the rest of the string (starting
1442 at the new input position), so we won't have to check space
1443 when there are no errors in the rest of the string) */
1444 repptr = PyUnicode_AS_UNICODE(repunicode);
1445 repsize = PyUnicode_GET_SIZE(repunicode);
1446 requiredsize = *outpos + repsize + insize-newpos;
1447 if (requiredsize > outsize) {
1448 if (requiredsize<2*outsize)
1449 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001450 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001451 goto onError;
1452 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1453 }
1454 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001455 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 Py_UNICODE_COPY(*outptr, repptr, repsize);
1457 *outptr += repsize;
1458 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001459
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460 /* we made it! */
1461 res = 0;
1462
1463 onError:
1464 Py_XDECREF(restuple);
1465 return res;
1466}
1467
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001468/* --- UTF-7 Codec -------------------------------------------------------- */
1469
1470/* see RFC2152 for details */
1471
/* Classification of the 128 ASCII characters for the UTF-7 encoder,
   i.e. whether a character is special and cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   Each row below covers 16 consecutive character codes. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1490
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be written
   directly in UTF-7 output.  Characters outside 1..127 are always
   special; for the rest the utf7_special table decides, with the
   optional classes (2: whitespace, 3: Set O) counted as special only
   when the matching flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a base64 digit */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c; only meaningful when
   B64CHAR(c) holds */
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while the accumulator ch holds at least six
   pending bits, emit the topmost six as a base64 digit through out and
   decrease bits accordingly.  Modifies out and bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while the accumulator ch holds at
   least sixteen pending bits, take the topmost sixteen as one code unit.
   Modifies out, bits and surrogate.  Relies on a variable `errmsg` and a
   label `utf7Error` being available where the macro is expanded. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* An error was already reported for the first half of  \
               the pair, so this unit is dropped without checking */    \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* Part of a surrogate pair, which cannot be            \
               represented in a 16-bit character: report an error */    \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001533
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001535 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536 const char *errors)
1537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001539 Py_ssize_t startinpos;
1540 Py_ssize_t endinpos;
1541 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001542 const char *e;
1543 PyUnicodeObject *unicode;
1544 Py_UNICODE *p;
1545 const char *errmsg = "";
1546 int inShift = 0;
1547 unsigned int bitsleft = 0;
1548 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 int surrogate = 0;
1550 PyObject *errorHandler = NULL;
1551 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001552
1553 unicode = _PyUnicode_New(size);
1554 if (!unicode)
1555 return NULL;
1556 if (size == 0)
1557 return (PyObject *)unicode;
1558
1559 p = unicode->str;
1560 e = s + size;
1561
1562 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001563 Py_UNICODE ch;
1564 restart:
1565 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001566
1567 if (inShift) {
1568 if ((ch == '-') || !B64CHAR(ch)) {
1569 inShift = 0;
1570 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001571
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1573 if (bitsleft >= 6) {
1574 /* The shift sequence has a partial character in it. If
1575 bitsleft < 6 then we could just classify it as padding
1576 but that is not the case here */
1577
1578 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001579 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001580 }
1581 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001582 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 here so indicate the potential of a misencoded character. */
1584
1585 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1586 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1587 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001588 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 }
1590
1591 if (ch == '-') {
1592 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001593 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 inShift = 1;
1595 }
1596 } else if (SPECIAL(ch,0,0)) {
1597 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001598 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 } else {
1600 *p++ = ch;
1601 }
1602 } else {
1603 charsleft = (charsleft << 6) | UB64(ch);
1604 bitsleft += 6;
1605 s++;
1606 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1607 }
1608 }
1609 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 s++;
1612 if (s < e && *s == '-') {
1613 s++;
1614 *p++ = '+';
1615 } else
1616 {
1617 inShift = 1;
1618 bitsleft = 0;
1619 }
1620 }
1621 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001622 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623 errmsg = "unexpected special character";
1624 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001625 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 }
1627 else {
1628 *p++ = ch;
1629 s++;
1630 }
1631 continue;
1632 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 outpos = p-PyUnicode_AS_UNICODE(unicode);
1634 endinpos = s-starts;
1635 if (unicode_decode_call_errorhandler(
1636 errors, &errorHandler,
1637 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001638 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001639 (PyObject **)&unicode, &outpos, &p))
1640 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 }
1642
1643 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 outpos = p-PyUnicode_AS_UNICODE(unicode);
1645 endinpos = size;
1646 if (unicode_decode_call_errorhandler(
1647 errors, &errorHandler,
1648 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001649 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001650 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 if (s < e)
1653 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001654 }
1655
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001656 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 goto onError;
1658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 Py_XDECREF(errorHandler);
1660 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001661 return (PyObject *)unicode;
1662
1663onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 Py_XDECREF(errorHandler);
1665 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 Py_DECREF(unicode);
1667 return NULL;
1668}
1669
1670
1671PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001672 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 int encodeSetO,
1674 int encodeWhiteSpace,
1675 const char *errors)
1676{
1677 PyObject *v;
1678 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001679 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001680 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001681 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 unsigned int bitsleft = 0;
1683 unsigned long charsleft = 0;
1684 char * out;
1685 char * start;
1686
1687 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001688 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689
Walter Dörwald51ab4142007-05-05 14:43:36 +00001690 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 if (v == NULL)
1692 return NULL;
1693
Walter Dörwald51ab4142007-05-05 14:43:36 +00001694 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001695 for (;i < size; ++i) {
1696 Py_UNICODE ch = s[i];
1697
1698 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001699 if (ch == '+') {
1700 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001701 *out++ = '-';
1702 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1703 charsleft = ch;
1704 bitsleft = 16;
1705 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001706 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001707 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001708 } else {
1709 *out++ = (char) ch;
1710 }
1711 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1713 *out++ = B64(charsleft << (6-bitsleft));
1714 charsleft = 0;
1715 bitsleft = 0;
1716 /* Characters not in the BASE64 set implicitly unshift the sequence
1717 so no '-' is required, except if the character is itself a '-' */
1718 if (B64CHAR(ch) || ch == '-') {
1719 *out++ = '-';
1720 }
1721 inShift = 0;
1722 *out++ = (char) ch;
1723 } else {
1724 bitsleft += 16;
1725 charsleft = (charsleft << 16) | ch;
1726 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1727
1728 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001729 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 or '-' then the shift sequence will be terminated implicitly and we
1731 don't have to insert a '-'. */
1732
1733 if (bitsleft == 0) {
1734 if (i + 1 < size) {
1735 Py_UNICODE ch2 = s[i+1];
1736
1737 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001738
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001739 } else if (B64CHAR(ch2) || ch2 == '-') {
1740 *out++ = '-';
1741 inShift = 0;
1742 } else {
1743 inShift = 0;
1744 }
1745
1746 }
1747 else {
1748 *out++ = '-';
1749 inShift = 0;
1750 }
1751 }
Tim Petersced69f82003-09-16 20:30:58 +00001752 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001754 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001755 if (bitsleft) {
1756 *out++= B64(charsleft << (6-bitsleft) );
1757 *out++ = '-';
1758 }
1759
Walter Dörwald51ab4142007-05-05 14:43:36 +00001760 if (PyBytes_Resize(v, out - start)) {
1761 Py_DECREF(v);
1762 return NULL;
1763 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001764 return v;
1765}
1766
1767#undef SPECIAL
1768#undef B64
1769#undef B64CHAR
1770#undef UB64
1771#undef ENCODE
1772#undef DECODE
1773
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774/* --- UTF-8 Codec -------------------------------------------------------- */
1775
/* Map a UTF-8 lead byte to the total length in bytes of the sequence it
   starts.  Zero means the byte is not a legal lead byte (continuation
   bytes 0x80..0xBF and the bytes 0xFE/0xFF).  See RFC 2279 for details.
   The table is read-only, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1797
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001799 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 const char *errors)
1801{
Walter Dörwald69652032004-09-07 20:24:22 +00001802 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1803}
1804
1805PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001806 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001807 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001808 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001812 Py_ssize_t startinpos;
1813 Py_ssize_t endinpos;
1814 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 const char *e;
1816 PyUnicodeObject *unicode;
1817 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001818 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 PyObject *errorHandler = NULL;
1820 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821
1822 /* Note: size will always be longer than the resulting Unicode
1823 character count */
1824 unicode = _PyUnicode_New(size);
1825 if (!unicode)
1826 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001827 if (size == 0) {
1828 if (consumed)
1829 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001832
1833 /* Unpack UTF-8 encoded data */
1834 p = unicode->str;
1835 e = s + size;
1836
1837 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001838 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839
1840 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001841 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 s++;
1843 continue;
1844 }
1845
1846 n = utf8_code_length[ch];
1847
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001848 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001849 if (consumed)
1850 break;
1851 else {
1852 errmsg = "unexpected end of data";
1853 startinpos = s-starts;
1854 endinpos = size;
1855 goto utf8Error;
1856 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858
1859 switch (n) {
1860
1861 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001862 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 startinpos = s-starts;
1864 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001865 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866
1867 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001868 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 startinpos = s-starts;
1870 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001871 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872
1873 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001874 if ((s[1] & 0xc0) != 0x80) {
1875 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 startinpos = s-starts;
1877 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001878 goto utf8Error;
1879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001881 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001882 startinpos = s-starts;
1883 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001884 errmsg = "illegal encoding";
1885 goto utf8Error;
1886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001888 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 break;
1890
1891 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001892 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 (s[2] & 0xc0) != 0x80) {
1894 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 startinpos = s-starts;
1896 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001897 goto utf8Error;
1898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001900 if (ch < 0x0800) {
1901 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001902 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001903
1904 XXX For wide builds (UCS-4) we should probably try
1905 to recombine the surrogates into a single code
1906 unit.
1907 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001908 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001909 startinpos = s-starts;
1910 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001911 goto utf8Error;
1912 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001914 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001915 break;
1916
1917 case 4:
1918 if ((s[1] & 0xc0) != 0x80 ||
1919 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001920 (s[3] & 0xc0) != 0x80) {
1921 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 startinpos = s-starts;
1923 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001924 goto utf8Error;
1925 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1927 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1928 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001929 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001930 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001931 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001932 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001933 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001934 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 startinpos = s-starts;
1936 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001937 goto utf8Error;
1938 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001939#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001940 *p++ = (Py_UNICODE)ch;
1941#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001942 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001943
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001944 /* translate from 10000..10FFFF to 0..FFFF */
1945 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001946
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001947 /* high surrogate = top 10 bits added to D800 */
1948 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001949
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001950 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001951 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001952#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 break;
1954
1955 default:
1956 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001957 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001958 startinpos = s-starts;
1959 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001960 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 }
1962 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001963 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001964
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001965 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001966 outpos = p-PyUnicode_AS_UNICODE(unicode);
1967 if (unicode_decode_call_errorhandler(
1968 errors, &errorHandler,
1969 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001970 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 (PyObject **)&unicode, &outpos, &p))
1972 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 }
Walter Dörwald69652032004-09-07 20:24:22 +00001974 if (consumed)
1975 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976
1977 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001978 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 goto onError;
1980
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001981 Py_XDECREF(errorHandler);
1982 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 return (PyObject *)unicode;
1984
1985onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001986 Py_XDECREF(errorHandler);
1987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 Py_DECREF(unicode);
1989 return NULL;
1990}
1991
Tim Peters602f7402002-04-27 18:03:26 +00001992/* Allocation strategy: if the string is short, convert into a stack buffer
1993 and allocate exactly as much space needed at the end. Else allocate the
1994 maximum possible needed (4 result bytes per Unicode character), and return
1995 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001996*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001997PyObject *
1998PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001999 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002000 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001{
Tim Peters602f7402002-04-27 18:03:26 +00002002#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002003
Martin v. Löwis18e16552006-02-15 17:27:45 +00002004 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002005 PyObject *v; /* result string object */
2006 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002007 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002008 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002009 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002010
Tim Peters602f7402002-04-27 18:03:26 +00002011 assert(s != NULL);
2012 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013
Tim Peters602f7402002-04-27 18:03:26 +00002014 if (size <= MAX_SHORT_UNICHARS) {
2015 /* Write into the stack buffer; nallocated can't overflow.
2016 * At the end, we'll allocate exactly as much heap space as it
2017 * turns out we need.
2018 */
2019 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2020 v = NULL; /* will allocate after we're done */
2021 p = stackbuf;
2022 }
2023 else {
2024 /* Overallocate on the heap, and give the excess back at the end. */
2025 nallocated = size * 4;
2026 if (nallocated / 4 != size) /* overflow! */
2027 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002028 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002029 if (v == NULL)
2030 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002031 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002032 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002033
Tim Peters602f7402002-04-27 18:03:26 +00002034 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002036
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002037 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002038 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002040
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002042 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002043 *p++ = (char)(0xc0 | (ch >> 6));
2044 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002045 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002046 else {
Tim Peters602f7402002-04-27 18:03:26 +00002047 /* Encode UCS2 Unicode ordinals */
2048 if (ch < 0x10000) {
2049 /* Special case: check for high surrogate */
2050 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2051 Py_UCS4 ch2 = s[i];
2052 /* Check for low surrogate and combine the two to
2053 form a UCS4 value */
2054 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002055 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002056 i++;
2057 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002058 }
Tim Peters602f7402002-04-27 18:03:26 +00002059 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002060 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002061 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002062 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2063 *p++ = (char)(0x80 | (ch & 0x3f));
2064 continue;
2065 }
2066encodeUCS4:
2067 /* Encode UCS4 Unicode ordinals */
2068 *p++ = (char)(0xf0 | (ch >> 18));
2069 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2070 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2071 *p++ = (char)(0x80 | (ch & 0x3f));
2072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002074
Tim Peters602f7402002-04-27 18:03:26 +00002075 if (v == NULL) {
2076 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002077 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002078 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002079 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002080 }
2081 else {
2082 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002083 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002084 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002085 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002088
Tim Peters602f7402002-04-27 18:03:26 +00002089#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090}
2091
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2093{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 if (!PyUnicode_Check(unicode)) {
2095 PyErr_BadArgument();
2096 return NULL;
2097 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002098 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2099 PyUnicode_GET_SIZE(unicode),
2100 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101}
2102
Walter Dörwald41980ca2007-08-16 21:55:45 +00002103/* --- UTF-32 Codec ------------------------------------------------------- */
2104
2105PyObject *
2106PyUnicode_DecodeUTF32(const char *s,
2107 Py_ssize_t size,
2108 const char *errors,
2109 int *byteorder)
2110{
2111 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2112}
2113
2114PyObject *
2115PyUnicode_DecodeUTF32Stateful(const char *s,
2116 Py_ssize_t size,
2117 const char *errors,
2118 int *byteorder,
2119 Py_ssize_t *consumed)
2120{
2121 const char *starts = s;
2122 Py_ssize_t startinpos;
2123 Py_ssize_t endinpos;
2124 Py_ssize_t outpos;
2125 PyUnicodeObject *unicode;
2126 Py_UNICODE *p;
2127#ifndef Py_UNICODE_WIDE
2128 int i, pairs;
2129#else
2130 const int pairs = 0;
2131#endif
2132 const unsigned char *q, *e;
2133 int bo = 0; /* assume native ordering by default */
2134 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002135 /* Offsets from q for retrieving bytes in the right order. */
2136#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2137 int iorder[] = {0, 1, 2, 3};
2138#else
2139 int iorder[] = {3, 2, 1, 0};
2140#endif
2141 PyObject *errorHandler = NULL;
2142 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002143 /* On narrow builds we split characters outside the BMP into two
2144 codepoints => count how much extra space we need. */
2145#ifndef Py_UNICODE_WIDE
2146 for (i = pairs = 0; i < size/4; i++)
2147 if (((Py_UCS4 *)s)[i] >= 0x10000)
2148 pairs++;
2149#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002150
2151 /* This might be one to much, because of a BOM */
2152 unicode = _PyUnicode_New((size+3)/4+pairs);
2153 if (!unicode)
2154 return NULL;
2155 if (size == 0)
2156 return (PyObject *)unicode;
2157
2158 /* Unpack UTF-32 encoded data */
2159 p = unicode->str;
2160 q = (unsigned char *)s;
2161 e = q + size;
2162
2163 if (byteorder)
2164 bo = *byteorder;
2165
2166 /* Check for BOM marks (U+FEFF) in the input and adjust current
2167 byte order setting accordingly. In native mode, the leading BOM
2168 mark is skipped, in all other modes, it is copied to the output
2169 stream as-is (giving a ZWNBSP character). */
2170 if (bo == 0) {
2171 if (size >= 4) {
2172 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2173 (q[iorder[1]] << 8) | q[iorder[0]];
2174#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2175 if (bom == 0x0000FEFF) {
2176 q += 4;
2177 bo = -1;
2178 }
2179 else if (bom == 0xFFFE0000) {
2180 q += 4;
2181 bo = 1;
2182 }
2183#else
2184 if (bom == 0x0000FEFF) {
2185 q += 4;
2186 bo = 1;
2187 }
2188 else if (bom == 0xFFFE0000) {
2189 q += 4;
2190 bo = -1;
2191 }
2192#endif
2193 }
2194 }
2195
2196 if (bo == -1) {
2197 /* force LE */
2198 iorder[0] = 0;
2199 iorder[1] = 1;
2200 iorder[2] = 2;
2201 iorder[3] = 3;
2202 }
2203 else if (bo == 1) {
2204 /* force BE */
2205 iorder[0] = 3;
2206 iorder[1] = 2;
2207 iorder[2] = 1;
2208 iorder[3] = 0;
2209 }
2210
2211 while (q < e) {
2212 Py_UCS4 ch;
2213 /* remaining bytes at the end? (size should be divisible by 4) */
2214 if (e-q<4) {
2215 if (consumed)
2216 break;
2217 errmsg = "truncated data";
2218 startinpos = ((const char *)q)-starts;
2219 endinpos = ((const char *)e)-starts;
2220 goto utf32Error;
2221 /* The remaining input chars are ignored if the callback
2222 chooses to skip the input */
2223 }
2224 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2225 (q[iorder[1]] << 8) | q[iorder[0]];
2226
2227 if (ch >= 0x110000)
2228 {
2229 errmsg = "codepoint not in range(0x110000)";
2230 startinpos = ((const char *)q)-starts;
2231 endinpos = startinpos+4;
2232 goto utf32Error;
2233 }
2234#ifndef Py_UNICODE_WIDE
2235 if (ch >= 0x10000)
2236 {
2237 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2238 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2239 }
2240 else
2241#endif
2242 *p++ = ch;
2243 q += 4;
2244 continue;
2245 utf32Error:
2246 outpos = p-PyUnicode_AS_UNICODE(unicode);
2247 if (unicode_decode_call_errorhandler(
2248 errors, &errorHandler,
2249 "utf32", errmsg,
2250 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2251 (PyObject **)&unicode, &outpos, &p))
2252 goto onError;
2253 }
2254
2255 if (byteorder)
2256 *byteorder = bo;
2257
2258 if (consumed)
2259 *consumed = (const char *)q-starts;
2260
2261 /* Adjust length */
2262 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2263 goto onError;
2264
2265 Py_XDECREF(errorHandler);
2266 Py_XDECREF(exc);
2267 return (PyObject *)unicode;
2268
2269onError:
2270 Py_DECREF(unicode);
2271 Py_XDECREF(errorHandler);
2272 Py_XDECREF(exc);
2273 return NULL;
2274}
2275
2276PyObject *
2277PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2278 Py_ssize_t size,
2279 const char *errors,
2280 int byteorder)
2281{
2282 PyObject *v;
2283 unsigned char *p;
2284#ifndef Py_UNICODE_WIDE
2285 int i, pairs;
2286#else
2287 const int pairs = 0;
2288#endif
2289 /* Offsets from p for storing byte pairs in the right order. */
2290#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2291 int iorder[] = {0, 1, 2, 3};
2292#else
2293 int iorder[] = {3, 2, 1, 0};
2294#endif
2295
2296#define STORECHAR(CH) \
2297 do { \
2298 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2299 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2300 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2301 p[iorder[0]] = (CH) & 0xff; \
2302 p += 4; \
2303 } while(0)
2304
2305 /* In narrow builds we can output surrogate pairs as one codepoint,
2306 so we need less space. */
2307#ifndef Py_UNICODE_WIDE
2308 for (i = pairs = 0; i < size-1; i++)
2309 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2310 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2311 pairs++;
2312#endif
2313 v = PyBytes_FromStringAndSize(NULL,
2314 4 * (size - pairs + (byteorder == 0)));
2315 if (v == NULL)
2316 return NULL;
2317
2318 p = (unsigned char *)PyBytes_AS_STRING(v);
2319 if (byteorder == 0)
2320 STORECHAR(0xFEFF);
2321 if (size == 0)
2322 return v;
2323
2324 if (byteorder == -1) {
2325 /* force LE */
2326 iorder[0] = 0;
2327 iorder[1] = 1;
2328 iorder[2] = 2;
2329 iorder[3] = 3;
2330 }
2331 else if (byteorder == 1) {
2332 /* force BE */
2333 iorder[0] = 3;
2334 iorder[1] = 2;
2335 iorder[2] = 1;
2336 iorder[3] = 0;
2337 }
2338
2339 while (size-- > 0) {
2340 Py_UCS4 ch = *s++;
2341#ifndef Py_UNICODE_WIDE
2342 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2343 Py_UCS4 ch2 = *s;
2344 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2345 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2346 s++;
2347 size--;
2348 }
2349 }
2350#endif
2351 STORECHAR(ch);
2352 }
2353 return v;
2354#undef STORECHAR
2355}
2356
2357PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2358{
2359 if (!PyUnicode_Check(unicode)) {
2360 PyErr_BadArgument();
2361 return NULL;
2362 }
2363 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2364 PyUnicode_GET_SIZE(unicode),
2365 NULL,
2366 0);
2367}
2368
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369/* --- UTF-16 Codec ------------------------------------------------------- */
2370
Tim Peters772747b2001-08-09 22:21:55 +00002371PyObject *
2372PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002373 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002374 const char *errors,
2375 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376{
Walter Dörwald69652032004-09-07 20:24:22 +00002377 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2378}
2379
2380PyObject *
2381PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002382 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002383 const char *errors,
2384 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002385 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002386{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002387 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002388 Py_ssize_t startinpos;
2389 Py_ssize_t endinpos;
2390 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002391 PyUnicodeObject *unicode;
2392 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002393 const unsigned char *q, *e;
2394 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002395 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002396 /* Offsets from q for retrieving byte pairs in the right order. */
2397#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2398 int ihi = 1, ilo = 0;
2399#else
2400 int ihi = 0, ilo = 1;
2401#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002402 PyObject *errorHandler = NULL;
2403 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404
2405 /* Note: size will always be longer than the resulting Unicode
2406 character count */
2407 unicode = _PyUnicode_New(size);
2408 if (!unicode)
2409 return NULL;
2410 if (size == 0)
2411 return (PyObject *)unicode;
2412
2413 /* Unpack UTF-16 encoded data */
2414 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002415 q = (unsigned char *)s;
2416 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417
2418 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002419 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002421 /* Check for BOM marks (U+FEFF) in the input and adjust current
2422 byte order setting accordingly. In native mode, the leading BOM
2423 mark is skipped, in all other modes, it is copied to the output
2424 stream as-is (giving a ZWNBSP character). */
2425 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002426 if (size >= 2) {
2427 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002428#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002429 if (bom == 0xFEFF) {
2430 q += 2;
2431 bo = -1;
2432 }
2433 else if (bom == 0xFFFE) {
2434 q += 2;
2435 bo = 1;
2436 }
Tim Petersced69f82003-09-16 20:30:58 +00002437#else
Walter Dörwald69652032004-09-07 20:24:22 +00002438 if (bom == 0xFEFF) {
2439 q += 2;
2440 bo = 1;
2441 }
2442 else if (bom == 0xFFFE) {
2443 q += 2;
2444 bo = -1;
2445 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002446#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002447 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449
Tim Peters772747b2001-08-09 22:21:55 +00002450 if (bo == -1) {
2451 /* force LE */
2452 ihi = 1;
2453 ilo = 0;
2454 }
2455 else if (bo == 1) {
2456 /* force BE */
2457 ihi = 0;
2458 ilo = 1;
2459 }
2460
2461 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002463 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002465 if (consumed)
2466 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002467 errmsg = "truncated data";
2468 startinpos = ((const char *)q)-starts;
2469 endinpos = ((const char *)e)-starts;
2470 goto utf16Error;
2471 /* The remaining input chars are ignored if the callback
2472 chooses to skip the input */
2473 }
2474 ch = (q[ihi] << 8) | q[ilo];
2475
Tim Peters772747b2001-08-09 22:21:55 +00002476 q += 2;
2477
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 if (ch < 0xD800 || ch > 0xDFFF) {
2479 *p++ = ch;
2480 continue;
2481 }
2482
2483 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002484 if (q >= e) {
2485 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 startinpos = (((const char *)q)-2)-starts;
2487 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002488 goto utf16Error;
2489 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002490 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002491 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2492 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002493 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002494#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002495 *p++ = ch;
2496 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002497#else
2498 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002499#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002500 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002501 }
2502 else {
2503 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504 startinpos = (((const char *)q)-4)-starts;
2505 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002506 goto utf16Error;
2507 }
2508
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002510 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 startinpos = (((const char *)q)-2)-starts;
2512 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002513 /* Fall through to report the error */
2514
2515 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002516 outpos = p-PyUnicode_AS_UNICODE(unicode);
2517 if (unicode_decode_call_errorhandler(
2518 errors, &errorHandler,
2519 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002520 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002522 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 }
2524
2525 if (byteorder)
2526 *byteorder = bo;
2527
Walter Dörwald69652032004-09-07 20:24:22 +00002528 if (consumed)
2529 *consumed = (const char *)q-starts;
2530
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002532 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 goto onError;
2534
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 Py_XDECREF(errorHandler);
2536 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 return (PyObject *)unicode;
2538
2539onError:
2540 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541 Py_XDECREF(errorHandler);
2542 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 return NULL;
2544}
2545
Tim Peters772747b2001-08-09 22:21:55 +00002546PyObject *
2547PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002548 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002549 const char *errors,
2550 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551{
2552 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002553 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002554#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002555 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002556#else
2557 const int pairs = 0;
2558#endif
Tim Peters772747b2001-08-09 22:21:55 +00002559 /* Offsets from p for storing byte pairs in the right order. */
2560#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2561 int ihi = 1, ilo = 0;
2562#else
2563 int ihi = 0, ilo = 1;
2564#endif
2565
2566#define STORECHAR(CH) \
2567 do { \
2568 p[ihi] = ((CH) >> 8) & 0xff; \
2569 p[ilo] = (CH) & 0xff; \
2570 p += 2; \
2571 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002573#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002574 for (i = pairs = 0; i < size; i++)
2575 if (s[i] >= 0x10000)
2576 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002577#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002578 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002579 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (v == NULL)
2581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582
Walter Dörwald3cc34522007-05-04 10:48:27 +00002583 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002585 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002586 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002587 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002588
2589 if (byteorder == -1) {
2590 /* force LE */
2591 ihi = 1;
2592 ilo = 0;
2593 }
2594 else if (byteorder == 1) {
2595 /* force BE */
2596 ihi = 0;
2597 ilo = 1;
2598 }
2599
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002600 while (size-- > 0) {
2601 Py_UNICODE ch = *s++;
2602 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002603#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002604 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002605 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2606 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002608#endif
Tim Peters772747b2001-08-09 22:21:55 +00002609 STORECHAR(ch);
2610 if (ch2)
2611 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002614#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615}
2616
2617PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2618{
2619 if (!PyUnicode_Check(unicode)) {
2620 PyErr_BadArgument();
2621 return NULL;
2622 }
2623 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2624 PyUnicode_GET_SIZE(unicode),
2625 NULL,
2626 0);
2627}
2628
2629/* --- Unicode Escape Codec ----------------------------------------------- */
2630
Fredrik Lundh06d12682001-01-24 07:59:11 +00002631static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002632
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002634 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 const char *errors)
2636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002638 Py_ssize_t startinpos;
2639 Py_ssize_t endinpos;
2640 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002641 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002643 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002645 char* message;
2646 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002647 PyObject *errorHandler = NULL;
2648 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002649
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 /* Escaped strings will always be longer than the resulting
2651 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002652 length after conversion to the true value.
2653 (but if the error callback returns a long replacement string
2654 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655 v = _PyUnicode_New(size);
2656 if (v == NULL)
2657 goto onError;
2658 if (size == 0)
2659 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002661 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002663
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664 while (s < end) {
2665 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002666 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668
2669 /* Non-escape characters are interpreted as Unicode ordinals */
2670 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002671 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 continue;
2673 }
2674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 /* \ - Escapes */
2677 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002678 c = *s++;
2679 if (s > end)
2680 c = '\0'; /* Invalid after \ */
2681 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
2683 /* \x escapes */
2684 case '\n': break;
2685 case '\\': *p++ = '\\'; break;
2686 case '\'': *p++ = '\''; break;
2687 case '\"': *p++ = '\"'; break;
2688 case 'b': *p++ = '\b'; break;
2689 case 'f': *p++ = '\014'; break; /* FF */
2690 case 't': *p++ = '\t'; break;
2691 case 'n': *p++ = '\n'; break;
2692 case 'r': *p++ = '\r'; break;
2693 case 'v': *p++ = '\013'; break; /* VT */
2694 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2695
2696 /* \OOO (octal) escapes */
2697 case '0': case '1': case '2': case '3':
2698 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002699 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002700 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002701 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002702 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002703 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002705 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 break;
2707
Fredrik Lundhccc74732001-02-18 22:13:49 +00002708 /* hex escapes */
2709 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 digits = 2;
2712 message = "truncated \\xXX escape";
2713 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714
Fredrik Lundhccc74732001-02-18 22:13:49 +00002715 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002717 digits = 4;
2718 message = "truncated \\uXXXX escape";
2719 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720
Fredrik Lundhccc74732001-02-18 22:13:49 +00002721 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002722 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002723 digits = 8;
2724 message = "truncated \\UXXXXXXXX escape";
2725 hexescape:
2726 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 outpos = p-PyUnicode_AS_UNICODE(v);
2728 if (s+digits>end) {
2729 endinpos = size;
2730 if (unicode_decode_call_errorhandler(
2731 errors, &errorHandler,
2732 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002733 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 (PyObject **)&v, &outpos, &p))
2735 goto onError;
2736 goto nextByte;
2737 }
2738 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002739 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002740 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 endinpos = (s+i+1)-starts;
2742 if (unicode_decode_call_errorhandler(
2743 errors, &errorHandler,
2744 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002745 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002747 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002749 }
2750 chr = (chr<<4) & ~0xF;
2751 if (c >= '0' && c <= '9')
2752 chr += c - '0';
2753 else if (c >= 'a' && c <= 'f')
2754 chr += 10 + c - 'a';
2755 else
2756 chr += 10 + c - 'A';
2757 }
2758 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002759 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 /* _decoding_error will have already written into the
2761 target buffer. */
2762 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002763 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002764 /* when we get here, chr is a 32-bit unicode character */
2765 if (chr <= 0xffff)
2766 /* UCS-2 character */
2767 *p++ = (Py_UNICODE) chr;
2768 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002769 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002770 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002771#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002772 *p++ = chr;
2773#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002774 chr -= 0x10000L;
2775 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002776 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002777#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002778 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 endinpos = s-starts;
2780 outpos = p-PyUnicode_AS_UNICODE(v);
2781 if (unicode_decode_call_errorhandler(
2782 errors, &errorHandler,
2783 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002784 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002786 goto onError;
2787 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002788 break;
2789
2790 /* \N{name} */
2791 case 'N':
2792 message = "malformed \\N character escape";
2793 if (ucnhash_CAPI == NULL) {
2794 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002795 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002796 m = PyImport_ImportModule("unicodedata");
2797 if (m == NULL)
2798 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002799 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002800 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002801 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002802 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002803 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002804 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002805 if (ucnhash_CAPI == NULL)
2806 goto ucnhashError;
2807 }
2808 if (*s == '{') {
2809 const char *start = s+1;
2810 /* look for the closing brace */
2811 while (*s != '}' && s < end)
2812 s++;
2813 if (s > start && s < end && *s == '}') {
2814 /* found a name. look it up in the unicode database */
2815 message = "unknown Unicode character name";
2816 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002817 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818 goto store;
2819 }
2820 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 endinpos = s-starts;
2822 outpos = p-PyUnicode_AS_UNICODE(v);
2823 if (unicode_decode_call_errorhandler(
2824 errors, &errorHandler,
2825 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002826 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002828 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002829 break;
2830
2831 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002832 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833 message = "\\ at end of string";
2834 s--;
2835 endinpos = s-starts;
2836 outpos = p-PyUnicode_AS_UNICODE(v);
2837 if (unicode_decode_call_errorhandler(
2838 errors, &errorHandler,
2839 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002840 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002841 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002842 goto onError;
2843 }
2844 else {
2845 *p++ = '\\';
2846 *p++ = (unsigned char)s[-1];
2847 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002848 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 nextByte:
2851 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002853 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002855 Py_XDECREF(errorHandler);
2856 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002858
Fredrik Lundhccc74732001-02-18 22:13:49 +00002859ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002860 PyErr_SetString(
2861 PyExc_UnicodeError,
2862 "\\N escapes not supported (can't load unicodedata module)"
2863 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002864 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 Py_XDECREF(errorHandler);
2866 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002867 return NULL;
2868
Fredrik Lundhccc74732001-02-18 22:13:49 +00002869onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002871 Py_XDECREF(errorHandler);
2872 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873 return NULL;
2874}
2875
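/* Return a Unicode-Escape string version of the Unicode object.

   NOTE(review): this comment used to describe a `quotes' flag that
   enclosed the result in u"" or u'' quotes as appropriate.  The encoder
   defined below (PyUnicode_EncodeUnicodeEscape) takes no such argument
   and emits no quotes -- confirm and drop the reference if it is stale.

*/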
2882
Thomas Wouters477c8d52006-05-27 19:21:47 +00002883Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2884 Py_ssize_t size,
2885 Py_UNICODE ch)
2886{
2887 /* like wcschr, but doesn't stop at NULL characters */
2888
2889 while (size-- > 0) {
2890 if (*s == ch)
2891 return s;
2892 s++;
2893 }
2894
2895 return NULL;
2896}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002897
Walter Dörwald79e913e2007-05-12 11:08:06 +00002898static const char *hexdigits = "0123456789abcdef";
2899
2900PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2901 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902{
2903 PyObject *repr;
2904 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905
Thomas Wouters89f507f2006-12-13 04:49:30 +00002906 /* XXX(nnorwitz): rather than over-allocating, it would be
2907 better to choose a different scheme. Perhaps scan the
2908 first N-chars of the string and allocate based on that size.
2909 */
2910 /* Initial allocation is based on the longest-possible unichr
2911 escape.
2912
2913 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2914 unichr, so in this case it's the longest unichr escape. In
2915 narrow (UTF-16) builds this is five chars per source unichr
2916 since there are two unichrs in the surrogate pair, so in narrow
2917 (UTF-16) builds it's not the longest unichr escape.
2918
2919 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2920 so in the narrow (UTF-16) build case it's the longest unichr
2921 escape.
2922 */
2923
Walter Dörwald79e913e2007-05-12 11:08:06 +00002924 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002925#ifdef Py_UNICODE_WIDE
2926 + 10*size
2927#else
2928 + 6*size
2929#endif
2930 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931 if (repr == NULL)
2932 return NULL;
2933
Walter Dörwald79e913e2007-05-12 11:08:06 +00002934 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 while (size-- > 0) {
2937 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002938
Walter Dörwald79e913e2007-05-12 11:08:06 +00002939 /* Escape backslashes */
2940 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 *p++ = '\\';
2942 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002943 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002944 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002945
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002946#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002947 /* Map 21-bit characters to '\U00xxxxxx' */
2948 else if (ch >= 0x10000) {
2949 *p++ = '\\';
2950 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002951 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2952 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2953 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2954 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2955 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2956 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2957 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2958 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002959 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002960 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002961#else
2962 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002963 else if (ch >= 0xD800 && ch < 0xDC00) {
2964 Py_UNICODE ch2;
2965 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002966
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002967 ch2 = *s++;
2968 size--;
2969 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2970 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2971 *p++ = '\\';
2972 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002973 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2974 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2975 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2976 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2977 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2978 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2979 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2980 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002981 continue;
2982 }
2983 /* Fall through: isolated surrogates are copied as-is */
2984 s--;
2985 size++;
2986 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002987#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002988
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002990 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 *p++ = '\\';
2992 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002993 *p++ = hexdigits[(ch >> 12) & 0x000F];
2994 *p++ = hexdigits[(ch >> 8) & 0x000F];
2995 *p++ = hexdigits[(ch >> 4) & 0x000F];
2996 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002998
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002999 /* Map special whitespace to '\t', \n', '\r' */
3000 else if (ch == '\t') {
3001 *p++ = '\\';
3002 *p++ = 't';
3003 }
3004 else if (ch == '\n') {
3005 *p++ = '\\';
3006 *p++ = 'n';
3007 }
3008 else if (ch == '\r') {
3009 *p++ = '\\';
3010 *p++ = 'r';
3011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003012
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003013 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003014 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003016 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003017 *p++ = hexdigits[(ch >> 4) & 0x000F];
3018 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003019 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003020
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 /* Copy everything else as-is */
3022 else
3023 *p++ = (char) ch;
3024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025
3026 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003027 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
3028 Py_DECREF(repr);
3029 return NULL;
3030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 return repr;
3032}
3033
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3035{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003036 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 if (!PyUnicode_Check(unicode)) {
3038 PyErr_BadArgument();
3039 return NULL;
3040 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003041 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3042 PyUnicode_GET_SIZE(unicode));
3043
3044 if (!s)
3045 return NULL;
3046 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3047 PyBytes_GET_SIZE(s));
3048 Py_DECREF(s);
3049 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050}
3051
3052/* --- Raw Unicode Escape Codec ------------------------------------------- */
3053
3054PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003055 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 const char *errors)
3057{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003059 Py_ssize_t startinpos;
3060 Py_ssize_t endinpos;
3061 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 const char *end;
3065 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 PyObject *errorHandler = NULL;
3067 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003068
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 /* Escaped strings will always be longer than the resulting
3070 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 length after conversion to the true value. (But decoding error
3072 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 v = _PyUnicode_New(size);
3074 if (v == NULL)
3075 goto onError;
3076 if (size == 0)
3077 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 end = s + size;
3080 while (s < end) {
3081 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003082 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003084 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085
3086 /* Non-escape characters are interpreted as Unicode ordinals */
3087 if (*s != '\\') {
3088 *p++ = (unsigned char)*s++;
3089 continue;
3090 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092
3093 /* \u-escapes are only interpreted iff the number of leading
3094 backslashes if odd */
3095 bs = s;
3096 for (;s < end;) {
3097 if (*s != '\\')
3098 break;
3099 *p++ = (unsigned char)*s++;
3100 }
3101 if (((s - bs) & 1) == 0 ||
3102 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003103 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 continue;
3105 }
3106 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003107 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 s++;
3109
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003110 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003112 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003114 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003115 endinpos = s-starts;
3116 if (unicode_decode_call_errorhandler(
3117 errors, &errorHandler,
3118 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003119 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 }
3124 x = (x<<4) & ~0xF;
3125 if (c >= '0' && c <= '9')
3126 x += c - '0';
3127 else if (c >= 'a' && c <= 'f')
3128 x += 10 + c - 'a';
3129 else
3130 x += 10 + c - 'A';
3131 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003132#ifndef Py_UNICODE_WIDE
3133 if (x > 0x10000) {
3134 if (unicode_decode_call_errorhandler(
3135 errors, &errorHandler,
3136 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003137 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003138 (PyObject **)&v, &outpos, &p))
3139 goto onError;
3140 }
3141#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 *p++ = x;
3143 nextByte:
3144 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003146 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003147 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003148 Py_XDECREF(errorHandler);
3149 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003151
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 onError:
3153 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 Py_XDECREF(errorHandler);
3155 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 return NULL;
3157}
3158
3159PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003160 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161{
3162 PyObject *repr;
3163 char *p;
3164 char *q;
3165
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003166#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003167 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003168#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003169 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003170#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 if (repr == NULL)
3172 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003173 if (size == 0)
3174 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175
Walter Dörwald711005d2007-05-12 12:03:26 +00003176 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 while (size-- > 0) {
3178 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003179#ifdef Py_UNICODE_WIDE
3180 /* Map 32-bit characters to '\Uxxxxxxxx' */
3181 if (ch >= 0x10000) {
3182 *p++ = '\\';
3183 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003184 *p++ = hexdigits[(ch >> 28) & 0xf];
3185 *p++ = hexdigits[(ch >> 24) & 0xf];
3186 *p++ = hexdigits[(ch >> 20) & 0xf];
3187 *p++ = hexdigits[(ch >> 16) & 0xf];
3188 *p++ = hexdigits[(ch >> 12) & 0xf];
3189 *p++ = hexdigits[(ch >> 8) & 0xf];
3190 *p++ = hexdigits[(ch >> 4) & 0xf];
3191 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003192 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003193 else
3194#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 /* Map 16-bit characters to '\uxxxx' */
3196 if (ch >= 256) {
3197 *p++ = '\\';
3198 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003199 *p++ = hexdigits[(ch >> 12) & 0xf];
3200 *p++ = hexdigits[(ch >> 8) & 0xf];
3201 *p++ = hexdigits[(ch >> 4) & 0xf];
3202 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 }
3204 /* Copy everything else as-is */
3205 else
3206 *p++ = (char) ch;
3207 }
3208 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003209 if (PyBytes_Resize(repr, p - q)) {
3210 Py_DECREF(repr);
3211 return NULL;
3212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 return repr;
3214}
3215
3216PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3217{
Walter Dörwald711005d2007-05-12 12:03:26 +00003218 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003220 PyErr_BadArgument();
3221 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003223 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3224 PyUnicode_GET_SIZE(unicode));
3225
3226 if (!s)
3227 return NULL;
3228 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3229 PyBytes_GET_SIZE(s));
3230 Py_DECREF(s);
3231 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232}
3233
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003234/* --- Unicode Internal Codec ------------------------------------------- */
3235
3236PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003237 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003238 const char *errors)
3239{
3240 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003241 Py_ssize_t startinpos;
3242 Py_ssize_t endinpos;
3243 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003244 PyUnicodeObject *v;
3245 Py_UNICODE *p;
3246 const char *end;
3247 const char *reason;
3248 PyObject *errorHandler = NULL;
3249 PyObject *exc = NULL;
3250
Neal Norwitzd43069c2006-01-08 01:12:10 +00003251#ifdef Py_UNICODE_WIDE
3252 Py_UNICODE unimax = PyUnicode_GetMax();
3253#endif
3254
Thomas Wouters89f507f2006-12-13 04:49:30 +00003255 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003256 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3257 if (v == NULL)
3258 goto onError;
3259 if (PyUnicode_GetSize((PyObject *)v) == 0)
3260 return (PyObject *)v;
3261 p = PyUnicode_AS_UNICODE(v);
3262 end = s + size;
3263
3264 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003265 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003266 /* We have to sanity check the raw data, otherwise doom looms for
3267 some malformed UCS-4 data. */
3268 if (
3269 #ifdef Py_UNICODE_WIDE
3270 *p > unimax || *p < 0 ||
3271 #endif
3272 end-s < Py_UNICODE_SIZE
3273 )
3274 {
3275 startinpos = s - starts;
3276 if (end-s < Py_UNICODE_SIZE) {
3277 endinpos = end-starts;
3278 reason = "truncated input";
3279 }
3280 else {
3281 endinpos = s - starts + Py_UNICODE_SIZE;
3282 reason = "illegal code point (> 0x10FFFF)";
3283 }
3284 outpos = p - PyUnicode_AS_UNICODE(v);
3285 if (unicode_decode_call_errorhandler(
3286 errors, &errorHandler,
3287 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003288 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003289 (PyObject **)&v, &outpos, &p)) {
3290 goto onError;
3291 }
3292 }
3293 else {
3294 p++;
3295 s += Py_UNICODE_SIZE;
3296 }
3297 }
3298
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003299 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003300 goto onError;
3301 Py_XDECREF(errorHandler);
3302 Py_XDECREF(exc);
3303 return (PyObject *)v;
3304
3305 onError:
3306 Py_XDECREF(v);
3307 Py_XDECREF(errorHandler);
3308 Py_XDECREF(exc);
3309 return NULL;
3310}
3311
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312/* --- Latin-1 Codec ------------------------------------------------------ */
3313
3314PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003315 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 const char *errors)
3317{
3318 PyUnicodeObject *v;
3319 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003320
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003322 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003323 Py_UNICODE r = *(unsigned char*)s;
3324 return PyUnicode_FromUnicode(&r, 1);
3325 }
3326
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 v = _PyUnicode_New(size);
3328 if (v == NULL)
3329 goto onError;
3330 if (size == 0)
3331 return (PyObject *)v;
3332 p = PyUnicode_AS_UNICODE(v);
3333 while (size-- > 0)
3334 *p++ = (unsigned char)*s++;
3335 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 onError:
3338 Py_XDECREF(v);
3339 return NULL;
3340}
3341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342/* create or adjust a UnicodeEncodeError */
3343static void make_encode_exception(PyObject **exceptionObject,
3344 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003345 const Py_UNICODE *unicode, Py_ssize_t size,
3346 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 if (*exceptionObject == NULL) {
3350 *exceptionObject = PyUnicodeEncodeError_Create(
3351 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 }
3353 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3355 goto onError;
3356 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3357 goto onError;
3358 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3359 goto onError;
3360 return;
3361 onError:
3362 Py_DECREF(*exceptionObject);
3363 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 }
3365}
3366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367/* raises a UnicodeEncodeError */
3368static void raise_encode_exception(PyObject **exceptionObject,
3369 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003370 const Py_UNICODE *unicode, Py_ssize_t size,
3371 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 const char *reason)
3373{
3374 make_encode_exception(exceptionObject,
3375 encoding, unicode, size, startpos, endpos, reason);
3376 if (*exceptionObject != NULL)
3377 PyCodec_StrictErrors(*exceptionObject);
3378}
3379
3380/* error handling callback helper:
3381 build arguments, call the callback and check the arguments,
3382 put the result into newpos and return the replacement string, which
3383 has to be freed by the caller */
3384static PyObject *unicode_encode_call_errorhandler(const char *errors,
3385 PyObject **errorHandler,
3386 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003387 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3388 Py_ssize_t startpos, Py_ssize_t endpos,
3389 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003391 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003392
3393 PyObject *restuple;
3394 PyObject *resunicode;
3395
3396 if (*errorHandler == NULL) {
3397 *errorHandler = PyCodec_LookupError(errors);
3398 if (*errorHandler == NULL)
3399 return NULL;
3400 }
3401
3402 make_encode_exception(exceptionObject,
3403 encoding, unicode, size, startpos, endpos, reason);
3404 if (*exceptionObject == NULL)
3405 return NULL;
3406
3407 restuple = PyObject_CallFunctionObjArgs(
3408 *errorHandler, *exceptionObject, NULL);
3409 if (restuple == NULL)
3410 return NULL;
3411 if (!PyTuple_Check(restuple)) {
3412 PyErr_Format(PyExc_TypeError, &argparse[4]);
3413 Py_DECREF(restuple);
3414 return NULL;
3415 }
3416 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3417 &resunicode, newpos)) {
3418 Py_DECREF(restuple);
3419 return NULL;
3420 }
3421 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003422 *newpos = size+*newpos;
3423 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003424 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003425 Py_DECREF(restuple);
3426 return NULL;
3427 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 Py_INCREF(resunicode);
3429 Py_DECREF(restuple);
3430 return resunicode;
3431}
3432
3433static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003434 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 const char *errors,
3436 int limit)
3437{
3438 /* output object */
3439 PyObject *res;
3440 /* pointers to the beginning and end+1 of input */
3441 const Py_UNICODE *startp = p;
3442 const Py_UNICODE *endp = p + size;
3443 /* pointer to the beginning of the unencodable characters */
3444 /* const Py_UNICODE *badp = NULL; */
3445 /* pointer into the output */
3446 char *str;
3447 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003448 Py_ssize_t respos = 0;
3449 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003450 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3451 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 PyObject *errorHandler = NULL;
3453 PyObject *exc = NULL;
3454 /* the following variable is used for caching string comparisons
3455 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3456 int known_errorHandler = -1;
3457
3458 /* allocate enough for a simple encoding without
3459 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003460 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 if (res == NULL)
3462 goto onError;
3463 if (size == 0)
3464 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003465 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 ressize = size;
3467
3468 while (p<endp) {
3469 Py_UNICODE c = *p;
3470
3471 /* can we encode this? */
3472 if (c<limit) {
3473 /* no overflow check, because we know that the space is enough */
3474 *str++ = (char)c;
3475 ++p;
3476 }
3477 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003478 Py_ssize_t unicodepos = p-startp;
3479 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003481 Py_ssize_t repsize;
3482 Py_ssize_t newpos;
3483 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 Py_UNICODE *uni2;
3485 /* startpos for collecting unencodable chars */
3486 const Py_UNICODE *collstart = p;
3487 const Py_UNICODE *collend = p;
3488 /* find all unecodable characters */
3489 while ((collend < endp) && ((*collend)>=limit))
3490 ++collend;
3491 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3492 if (known_errorHandler==-1) {
3493 if ((errors==NULL) || (!strcmp(errors, "strict")))
3494 known_errorHandler = 1;
3495 else if (!strcmp(errors, "replace"))
3496 known_errorHandler = 2;
3497 else if (!strcmp(errors, "ignore"))
3498 known_errorHandler = 3;
3499 else if (!strcmp(errors, "xmlcharrefreplace"))
3500 known_errorHandler = 4;
3501 else
3502 known_errorHandler = 0;
3503 }
3504 switch (known_errorHandler) {
3505 case 1: /* strict */
3506 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3507 goto onError;
3508 case 2: /* replace */
3509 while (collstart++<collend)
3510 *str++ = '?'; /* fall through */
3511 case 3: /* ignore */
3512 p = collend;
3513 break;
3514 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003515 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 /* determine replacement size (temporarily (mis)uses p) */
3517 for (p = collstart, repsize = 0; p < collend; ++p) {
3518 if (*p<10)
3519 repsize += 2+1+1;
3520 else if (*p<100)
3521 repsize += 2+2+1;
3522 else if (*p<1000)
3523 repsize += 2+3+1;
3524 else if (*p<10000)
3525 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003526#ifndef Py_UNICODE_WIDE
3527 else
3528 repsize += 2+5+1;
3529#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 else if (*p<100000)
3531 repsize += 2+5+1;
3532 else if (*p<1000000)
3533 repsize += 2+6+1;
3534 else
3535 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003536#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537 }
3538 requiredsize = respos+repsize+(endp-collend);
3539 if (requiredsize > ressize) {
3540 if (requiredsize<2*ressize)
3541 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003542 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003544 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 ressize = requiredsize;
3546 }
3547 /* generate replacement (temporarily (mis)uses p) */
3548 for (p = collstart; p < collend; ++p) {
3549 str += sprintf(str, "&#%d;", (int)*p);
3550 }
3551 p = collend;
3552 break;
3553 default:
3554 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3555 encoding, reason, startp, size, &exc,
3556 collstart-startp, collend-startp, &newpos);
3557 if (repunicode == NULL)
3558 goto onError;
3559 /* need more space? (at least enough for what we
3560 have+the replacement+the rest of the string, so
3561 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003562 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 repsize = PyUnicode_GET_SIZE(repunicode);
3564 requiredsize = respos+repsize+(endp-collend);
3565 if (requiredsize > ressize) {
3566 if (requiredsize<2*ressize)
3567 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003568 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 Py_DECREF(repunicode);
3570 goto onError;
3571 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003572 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 ressize = requiredsize;
3574 }
3575 /* check if there is anything unencodable in the replacement
3576 and copy it to the output */
3577 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3578 c = *uni2;
3579 if (c >= limit) {
3580 raise_encode_exception(&exc, encoding, startp, size,
3581 unicodepos, unicodepos+1, reason);
3582 Py_DECREF(repunicode);
3583 goto onError;
3584 }
3585 *str = (char)c;
3586 }
3587 p = startp + newpos;
3588 Py_DECREF(repunicode);
3589 }
3590 }
3591 }
3592 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003593 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 if (respos<ressize)
3595 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003596 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 Py_XDECREF(errorHandler);
3598 Py_XDECREF(exc);
3599 return res;
3600
3601 onError:
3602 Py_XDECREF(res);
3603 Py_XDECREF(errorHandler);
3604 Py_XDECREF(exc);
3605 return NULL;
3606}
3607
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003609 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 const char *errors)
3611{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613}
3614
3615PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3616{
3617 if (!PyUnicode_Check(unicode)) {
3618 PyErr_BadArgument();
3619 return NULL;
3620 }
3621 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3622 PyUnicode_GET_SIZE(unicode),
3623 NULL);
3624}
3625
3626/* --- 7-bit ASCII Codec -------------------------------------------------- */
3627
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003629 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 const char *errors)
3631{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633 PyUnicodeObject *v;
3634 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003635 Py_ssize_t startinpos;
3636 Py_ssize_t endinpos;
3637 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 const char *e;
3639 PyObject *errorHandler = NULL;
3640 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003641
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003643 if (size == 1 && *(unsigned char*)s < 128) {
3644 Py_UNICODE r = *(unsigned char*)s;
3645 return PyUnicode_FromUnicode(&r, 1);
3646 }
Tim Petersced69f82003-09-16 20:30:58 +00003647
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 v = _PyUnicode_New(size);
3649 if (v == NULL)
3650 goto onError;
3651 if (size == 0)
3652 return (PyObject *)v;
3653 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 e = s + size;
3655 while (s < e) {
3656 register unsigned char c = (unsigned char)*s;
3657 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 ++s;
3660 }
3661 else {
3662 startinpos = s-starts;
3663 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003664 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 if (unicode_decode_call_errorhandler(
3666 errors, &errorHandler,
3667 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003668 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003673 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003674 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003675 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 Py_XDECREF(errorHandler);
3677 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003679
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 onError:
3681 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 Py_XDECREF(errorHandler);
3683 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 return NULL;
3685}
3686
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003688 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 const char *errors)
3690{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692}
3693
3694PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3695{
3696 if (!PyUnicode_Check(unicode)) {
3697 PyErr_BadArgument();
3698 return NULL;
3699 }
3700 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3701 PyUnicode_GET_SIZE(unicode),
3702 NULL);
3703}
3704
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003705#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003706
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003707/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003708
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003709#if SIZEOF_INT < SIZEOF_SSIZE_T
3710#define NEED_RETRY
3711#endif
3712
3713/* XXX This code is limited to "true" double-byte encodings, as
3714 a) it assumes an incomplete character consists of a single byte, and
3715 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3716 encodings, see IsDBCSLeadByteEx documentation. */
3717
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  The byte must satisfy IsDBCSLeadByte, and additionally
   either have no preceding character (CharPrev returns the same
   position), or a preceding character that does not start with a lead
   byte, or a preceding character that is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3728
3729/*
3730 * Decode MBCS string into unicode object. If 'final' is set, converts
3731 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3732 */
3733static int decode_mbcs(PyUnicodeObject **v,
3734 const char *s, /* MBCS string */
3735 int size, /* sizeof MBCS string */
3736 int final)
3737{
3738 Py_UNICODE *p;
3739 Py_ssize_t n = 0;
3740 int usize = 0;
3741
3742 assert(size >= 0);
3743
3744 /* Skip trailing lead-byte unless 'final' is set */
3745 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3746 --size;
3747
3748 /* First get the size of the result */
3749 if (size > 0) {
3750 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3751 if (usize == 0) {
3752 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3753 return -1;
3754 }
3755 }
3756
3757 if (*v == NULL) {
3758 /* Create unicode object */
3759 *v = _PyUnicode_New(usize);
3760 if (*v == NULL)
3761 return -1;
3762 }
3763 else {
3764 /* Extend unicode object */
3765 n = PyUnicode_GET_SIZE(*v);
3766 if (_PyUnicode_Resize(v, n + usize) < 0)
3767 return -1;
3768 }
3769
3770 /* Do the conversion */
3771 if (size > 0) {
3772 p = PyUnicode_AS_UNICODE(*v) + n;
3773 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3774 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3775 return -1;
3776 }
3777 }
3778
3779 return size;
3780}
3781
3782PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3783 Py_ssize_t size,
3784 const char *errors,
3785 Py_ssize_t *consumed)
3786{
3787 PyUnicodeObject *v = NULL;
3788 int done;
3789
3790 if (consumed)
3791 *consumed = 0;
3792
3793#ifdef NEED_RETRY
3794 retry:
3795 if (size > INT_MAX)
3796 done = decode_mbcs(&v, s, INT_MAX, 0);
3797 else
3798#endif
3799 done = decode_mbcs(&v, s, (int)size, !consumed);
3800
3801 if (done < 0) {
3802 Py_XDECREF(v);
3803 return NULL;
3804 }
3805
3806 if (consumed)
3807 *consumed += done;
3808
3809#ifdef NEED_RETRY
3810 if (size > INT_MAX) {
3811 s += done;
3812 size -= done;
3813 goto retry;
3814 }
3815#endif
3816
3817 return (PyObject *)v;
3818}
3819
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003820PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003821 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003822 const char *errors)
3823{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003824 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3825}
3826
3827/*
3828 * Convert unicode into string object (MBCS).
3829 * Returns 0 if succeed, -1 otherwise.
3830 */
3831static int encode_mbcs(PyObject **repr,
3832 const Py_UNICODE *p, /* unicode */
3833 int size) /* size of unicode */
3834{
3835 int mbcssize = 0;
3836 Py_ssize_t n = 0;
3837
3838 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003839
3840 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003841 if (size > 0) {
3842 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3843 if (mbcssize == 0) {
3844 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3845 return -1;
3846 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003847 }
3848
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003849 if (*repr == NULL) {
3850 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003851 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003852 if (*repr == NULL)
3853 return -1;
3854 }
3855 else {
3856 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003857 n = PyBytes_Size(*repr);
3858 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003859 return -1;
3860 }
3861
3862 /* Do the conversion */
3863 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003864 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003865 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3866 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3867 return -1;
3868 }
3869 }
3870
3871 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003872}
3873
3874PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003875 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003876 const char *errors)
3877{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003878 PyObject *repr = NULL;
3879 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003880
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003881#ifdef NEED_RETRY
3882 retry:
3883 if (size > INT_MAX)
3884 ret = encode_mbcs(&repr, p, INT_MAX);
3885 else
3886#endif
3887 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003888
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003889 if (ret < 0) {
3890 Py_XDECREF(repr);
3891 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003892 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003893
3894#ifdef NEED_RETRY
3895 if (size > INT_MAX) {
3896 p += INT_MAX;
3897 size -= INT_MAX;
3898 goto retry;
3899 }
3900#endif
3901
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003902 return repr;
3903}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003904
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003905PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3906{
3907 if (!PyUnicode_Check(unicode)) {
3908 PyErr_BadArgument();
3909 return NULL;
3910 }
3911 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3912 PyUnicode_GET_SIZE(unicode),
3913 NULL);
3914}
3915
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003916#undef NEED_RETRY
3917
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003918#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003919
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920/* --- Character Mapping Codec -------------------------------------------- */
3921
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003923 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 PyObject *mapping,
3925 const char *errors)
3926{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003928 Py_ssize_t startinpos;
3929 Py_ssize_t endinpos;
3930 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932 PyUnicodeObject *v;
3933 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003934 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 PyObject *errorHandler = NULL;
3936 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003937 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003938 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003939
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 /* Default to Latin-1 */
3941 if (mapping == NULL)
3942 return PyUnicode_DecodeLatin1(s, size, errors);
3943
3944 v = _PyUnicode_New(size);
3945 if (v == NULL)
3946 goto onError;
3947 if (size == 0)
3948 return (PyObject *)v;
3949 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003951 if (PyUnicode_CheckExact(mapping)) {
3952 mapstring = PyUnicode_AS_UNICODE(mapping);
3953 maplen = PyUnicode_GET_SIZE(mapping);
3954 while (s < e) {
3955 unsigned char ch = *s;
3956 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003958 if (ch < maplen)
3959 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003961 if (x == 0xfffe) {
3962 /* undefined mapping */
3963 outpos = p-PyUnicode_AS_UNICODE(v);
3964 startinpos = s-starts;
3965 endinpos = startinpos+1;
3966 if (unicode_decode_call_errorhandler(
3967 errors, &errorHandler,
3968 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003969 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003970 (PyObject **)&v, &outpos, &p)) {
3971 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003972 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003973 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003974 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003975 *p++ = x;
3976 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003978 }
3979 else {
3980 while (s < e) {
3981 unsigned char ch = *s;
3982 PyObject *w, *x;
3983
3984 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3985 w = PyInt_FromLong((long)ch);
3986 if (w == NULL)
3987 goto onError;
3988 x = PyObject_GetItem(mapping, w);
3989 Py_DECREF(w);
3990 if (x == NULL) {
3991 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3992 /* No mapping found means: mapping is undefined. */
3993 PyErr_Clear();
3994 x = Py_None;
3995 Py_INCREF(x);
3996 } else
3997 goto onError;
3998 }
3999
4000 /* Apply mapping */
4001 if (PyInt_Check(x)) {
4002 long value = PyInt_AS_LONG(x);
4003 if (value < 0 || value > 65535) {
4004 PyErr_SetString(PyExc_TypeError,
4005 "character mapping must be in range(65536)");
4006 Py_DECREF(x);
4007 goto onError;
4008 }
4009 *p++ = (Py_UNICODE)value;
4010 }
4011 else if (x == Py_None) {
4012 /* undefined mapping */
4013 outpos = p-PyUnicode_AS_UNICODE(v);
4014 startinpos = s-starts;
4015 endinpos = startinpos+1;
4016 if (unicode_decode_call_errorhandler(
4017 errors, &errorHandler,
4018 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004019 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004020 (PyObject **)&v, &outpos, &p)) {
4021 Py_DECREF(x);
4022 goto onError;
4023 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004024 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004025 continue;
4026 }
4027 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004028 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004029
4030 if (targetsize == 1)
4031 /* 1-1 mapping */
4032 *p++ = *PyUnicode_AS_UNICODE(x);
4033
4034 else if (targetsize > 1) {
4035 /* 1-n mapping */
4036 if (targetsize > extrachars) {
4037 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004038 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4039 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004040 (targetsize << 2);
4041 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004042 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004043 if (_PyUnicode_Resize(&v,
4044 PyUnicode_GET_SIZE(v) + needed) < 0) {
4045 Py_DECREF(x);
4046 goto onError;
4047 }
4048 p = PyUnicode_AS_UNICODE(v) + oldpos;
4049 }
4050 Py_UNICODE_COPY(p,
4051 PyUnicode_AS_UNICODE(x),
4052 targetsize);
4053 p += targetsize;
4054 extrachars -= targetsize;
4055 }
4056 /* 1-0 mapping: skip the character */
4057 }
4058 else {
4059 /* wrong return value */
4060 PyErr_SetString(PyExc_TypeError,
4061 "character mapping must return integer, None or unicode");
4062 Py_DECREF(x);
4063 goto onError;
4064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004066 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 }
4069 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004070 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 Py_XDECREF(errorHandler);
4073 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004075
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 Py_XDECREF(errorHandler);
4078 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 Py_XDECREF(v);
4080 return NULL;
4081}
4082
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004083/* Charmap encoding: the lookup table */
4084
4085struct encoding_map{
4086 PyObject_HEAD
4087 unsigned char level1[32];
4088 int count2, count3;
4089 unsigned char level23[1];
4090};
4091
4092static PyObject*
4093encoding_map_size(PyObject *obj, PyObject* args)
4094{
4095 struct encoding_map *map = (struct encoding_map*)obj;
4096 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4097 128*map->count3);
4098}
4099
4100static PyMethodDef encoding_map_methods[] = {
4101 {"size", encoding_map_size, METH_NOARGS,
4102 PyDoc_STR("Return the size (in bytes) of this object") },
4103 { 0 }
4104};
4105
4106static void
4107encoding_map_dealloc(PyObject* o)
4108{
4109 PyObject_FREE(o);
4110}
4111
4112static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004113 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004114 "EncodingMap", /*tp_name*/
4115 sizeof(struct encoding_map), /*tp_basicsize*/
4116 0, /*tp_itemsize*/
4117 /* methods */
4118 encoding_map_dealloc, /*tp_dealloc*/
4119 0, /*tp_print*/
4120 0, /*tp_getattr*/
4121 0, /*tp_setattr*/
4122 0, /*tp_compare*/
4123 0, /*tp_repr*/
4124 0, /*tp_as_number*/
4125 0, /*tp_as_sequence*/
4126 0, /*tp_as_mapping*/
4127 0, /*tp_hash*/
4128 0, /*tp_call*/
4129 0, /*tp_str*/
4130 0, /*tp_getattro*/
4131 0, /*tp_setattro*/
4132 0, /*tp_as_buffer*/
4133 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4134 0, /*tp_doc*/
4135 0, /*tp_traverse*/
4136 0, /*tp_clear*/
4137 0, /*tp_richcompare*/
4138 0, /*tp_weaklistoffset*/
4139 0, /*tp_iter*/
4140 0, /*tp_iternext*/
4141 encoding_map_methods, /*tp_methods*/
4142 0, /*tp_members*/
4143 0, /*tp_getset*/
4144 0, /*tp_base*/
4145 0, /*tp_dict*/
4146 0, /*tp_descr_get*/
4147 0, /*tp_descr_set*/
4148 0, /*tp_dictoffset*/
4149 0, /*tp_init*/
4150 0, /*tp_alloc*/
4151 0, /*tp_new*/
4152 0, /*tp_free*/
4153 0, /*tp_is_gc*/
4154};
4155
4156PyObject*
4157PyUnicode_BuildEncodingMap(PyObject* string)
4158{
4159 Py_UNICODE *decode;
4160 PyObject *result;
4161 struct encoding_map *mresult;
4162 int i;
4163 int need_dict = 0;
4164 unsigned char level1[32];
4165 unsigned char level2[512];
4166 unsigned char *mlevel1, *mlevel2, *mlevel3;
4167 int count2 = 0, count3 = 0;
4168
4169 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4170 PyErr_BadArgument();
4171 return NULL;
4172 }
4173 decode = PyUnicode_AS_UNICODE(string);
4174 memset(level1, 0xFF, sizeof level1);
4175 memset(level2, 0xFF, sizeof level2);
4176
4177 /* If there isn't a one-to-one mapping of NULL to \0,
4178 or if there are non-BMP characters, we need to use
4179 a mapping dictionary. */
4180 if (decode[0] != 0)
4181 need_dict = 1;
4182 for (i = 1; i < 256; i++) {
4183 int l1, l2;
4184 if (decode[i] == 0
4185 #ifdef Py_UNICODE_WIDE
4186 || decode[i] > 0xFFFF
4187 #endif
4188 ) {
4189 need_dict = 1;
4190 break;
4191 }
4192 if (decode[i] == 0xFFFE)
4193 /* unmapped character */
4194 continue;
4195 l1 = decode[i] >> 11;
4196 l2 = decode[i] >> 7;
4197 if (level1[l1] == 0xFF)
4198 level1[l1] = count2++;
4199 if (level2[l2] == 0xFF)
4200 level2[l2] = count3++;
4201 }
4202
4203 if (count2 >= 0xFF || count3 >= 0xFF)
4204 need_dict = 1;
4205
4206 if (need_dict) {
4207 PyObject *result = PyDict_New();
4208 PyObject *key, *value;
4209 if (!result)
4210 return NULL;
4211 for (i = 0; i < 256; i++) {
4212 key = value = NULL;
4213 key = PyInt_FromLong(decode[i]);
4214 value = PyInt_FromLong(i);
4215 if (!key || !value)
4216 goto failed1;
4217 if (PyDict_SetItem(result, key, value) == -1)
4218 goto failed1;
4219 Py_DECREF(key);
4220 Py_DECREF(value);
4221 }
4222 return result;
4223 failed1:
4224 Py_XDECREF(key);
4225 Py_XDECREF(value);
4226 Py_DECREF(result);
4227 return NULL;
4228 }
4229
4230 /* Create a three-level trie */
4231 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4232 16*count2 + 128*count3 - 1);
4233 if (!result)
4234 return PyErr_NoMemory();
4235 PyObject_Init(result, &EncodingMapType);
4236 mresult = (struct encoding_map*)result;
4237 mresult->count2 = count2;
4238 mresult->count3 = count3;
4239 mlevel1 = mresult->level1;
4240 mlevel2 = mresult->level23;
4241 mlevel3 = mresult->level23 + 16*count2;
4242 memcpy(mlevel1, level1, 32);
4243 memset(mlevel2, 0xFF, 16*count2);
4244 memset(mlevel3, 0, 128*count3);
4245 count3 = 0;
4246 for (i = 1; i < 256; i++) {
4247 int o1, o2, o3, i2, i3;
4248 if (decode[i] == 0xFFFE)
4249 /* unmapped character */
4250 continue;
4251 o1 = decode[i]>>11;
4252 o2 = (decode[i]>>7) & 0xF;
4253 i2 = 16*mlevel1[o1] + o2;
4254 if (mlevel2[i2] == 0xFF)
4255 mlevel2[i2] = count3++;
4256 o3 = decode[i] & 0x7F;
4257 i3 = 128*mlevel2[i2] + o3;
4258 mlevel3[i3] = i;
4259 }
4260 return result;
4261}
4262
4263static int
4264encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4265{
4266 struct encoding_map *map = (struct encoding_map*)mapping;
4267 int l1 = c>>11;
4268 int l2 = (c>>7) & 0xF;
4269 int l3 = c & 0x7F;
4270 int i;
4271
4272#ifdef Py_UNICODE_WIDE
4273 if (c > 0xFFFF) {
4274 return -1;
4275 }
4276#endif
4277 if (c == 0)
4278 return 0;
4279 /* level 1*/
4280 i = map->level1[l1];
4281 if (i == 0xFF) {
4282 return -1;
4283 }
4284 /* level 2*/
4285 i = map->level23[16*i+l2];
4286 if (i == 0xFF) {
4287 return -1;
4288 }
4289 /* level 3 */
4290 i = map->level23[16*map->count2 + 128*i + l3];
4291 if (i == 0) {
4292 return -1;
4293 }
4294 return i;
4295}
4296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297/* Lookup the character ch in the mapping. If the character
4298 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004299 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 PyObject *w = PyInt_FromLong((long)c);
4303 PyObject *x;
4304
4305 if (w == NULL)
4306 return NULL;
4307 x = PyObject_GetItem(mapping, w);
4308 Py_DECREF(w);
4309 if (x == NULL) {
4310 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4311 /* No mapping found means: mapping is undefined. */
4312 PyErr_Clear();
4313 x = Py_None;
4314 Py_INCREF(x);
4315 return x;
4316 } else
4317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004319 else if (x == Py_None)
4320 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 else if (PyInt_Check(x)) {
4322 long value = PyInt_AS_LONG(x);
4323 if (value < 0 || value > 255) {
4324 PyErr_SetString(PyExc_TypeError,
4325 "character mapping must be in range(256)");
4326 Py_DECREF(x);
4327 return NULL;
4328 }
4329 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 else if (PyString_Check(x))
4332 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004335 PyErr_Format(PyExc_TypeError,
4336 "character mapping must return integer, None or str8, not %.400s",
4337 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 Py_DECREF(x);
4339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 }
4341}
4342
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004343static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004344charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004345{
Walter Dörwald827b0552007-05-12 13:23:53 +00004346 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004347 /* exponentially overallocate to minimize reallocations */
4348 if (requiredsize < 2*outsize)
4349 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004350 if (PyBytes_Resize(outobj, requiredsize)) {
4351 Py_DECREF(outobj);
4352 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004353 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004354 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004355}
4356
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004361 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362 space is available. Return a new reference to the object that
4363 was put in the output buffer, or Py_None, if the mapping was undefined
4364 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004365 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004367charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004368 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004370 PyObject *rep;
4371 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004372 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004374 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004375 int res = encoding_map_lookup(c, mapping);
4376 Py_ssize_t requiredsize = *outpos+1;
4377 if (res == -1)
4378 return enc_FAILED;
4379 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004380 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004381 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004382 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004383 outstart[(*outpos)++] = (char)res;
4384 return enc_SUCCESS;
4385 }
4386
4387 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004389 return enc_EXCEPTION;
4390 else if (rep==Py_None) {
4391 Py_DECREF(rep);
4392 return enc_FAILED;
4393 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004395 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004396 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004397 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004399 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004401 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4403 }
4404 else {
4405 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004406 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4407 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004408 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004409 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004411 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004413 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 memcpy(outstart + *outpos, repchars, repsize);
4415 *outpos += repsize;
4416 }
4417 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004418 Py_DECREF(rep);
4419 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420}
4421
4422/* handle an error in PyUnicode_EncodeCharmap
4423 Return 0 on success, -1 on error */
4424static
4425int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004428 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004429 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430{
4431 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t repsize;
4433 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 Py_UNICODE *uni2;
4435 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436 Py_ssize_t collstartpos = *inpos;
4437 Py_ssize_t collendpos = *inpos+1;
4438 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 char *encoding = "charmap";
4440 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004441 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 /* find all unencodable characters */
4444 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004445 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004446 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004447 int res = encoding_map_lookup(p[collendpos], mapping);
4448 if (res != -1)
4449 break;
4450 ++collendpos;
4451 continue;
4452 }
4453
4454 rep = charmapencode_lookup(p[collendpos], mapping);
4455 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004457 else if (rep!=Py_None) {
4458 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 break;
4460 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004461 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 ++collendpos;
4463 }
4464 /* cache callback name lookup
4465 * (if not done yet, i.e. it's the first error) */
4466 if (*known_errorHandler==-1) {
4467 if ((errors==NULL) || (!strcmp(errors, "strict")))
4468 *known_errorHandler = 1;
4469 else if (!strcmp(errors, "replace"))
4470 *known_errorHandler = 2;
4471 else if (!strcmp(errors, "ignore"))
4472 *known_errorHandler = 3;
4473 else if (!strcmp(errors, "xmlcharrefreplace"))
4474 *known_errorHandler = 4;
4475 else
4476 *known_errorHandler = 0;
4477 }
4478 switch (*known_errorHandler) {
4479 case 1: /* strict */
4480 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4481 return -1;
4482 case 2: /* replace */
4483 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4484 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004485 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 return -1;
4487 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004488 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4490 return -1;
4491 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 }
4493 /* fall through */
4494 case 3: /* ignore */
4495 *inpos = collendpos;
4496 break;
4497 case 4: /* xmlcharrefreplace */
4498 /* generate replacement (temporarily (mis)uses p) */
4499 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4500 char buffer[2+29+1+1];
4501 char *cp;
4502 sprintf(buffer, "&#%d;", (int)p[collpos]);
4503 for (cp = buffer; *cp; ++cp) {
4504 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004505 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004507 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4509 return -1;
4510 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 }
4512 }
4513 *inpos = collendpos;
4514 break;
4515 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004516 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 encoding, reason, p, size, exceptionObject,
4518 collstartpos, collendpos, &newpos);
4519 if (repunicode == NULL)
4520 return -1;
4521 /* generate replacement */
4522 repsize = PyUnicode_GET_SIZE(repunicode);
4523 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4524 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004525 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 return -1;
4527 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004528 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4531 return -1;
4532 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 }
4534 *inpos = newpos;
4535 Py_DECREF(repunicode);
4536 }
4537 return 0;
4538}
4539
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004541 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 PyObject *mapping,
4543 const char *errors)
4544{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 /* output object */
4546 PyObject *res = NULL;
4547 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004548 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004550 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 PyObject *errorHandler = NULL;
4552 PyObject *exc = NULL;
4553 /* the following variable is used for caching string comparisons
4554 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4555 * 3=ignore, 4=xmlcharrefreplace */
4556 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557
4558 /* Default to Latin-1 */
4559 if (mapping == NULL)
4560 return PyUnicode_EncodeLatin1(p, size, errors);
4561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 /* allocate enough for a simple encoding without
4563 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004564 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 if (res == NULL)
4566 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004567 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 while (inpos<size) {
4571 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004572 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004573 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004575 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 if (charmap_encoding_error(p, size, &inpos, mapping,
4577 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004578 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004579 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004580 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 else
4584 /* done with this character => adjust input position */
4585 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004589 if (respos<PyBytes_GET_SIZE(res)) {
4590 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 goto onError;
4592 }
4593 Py_XDECREF(exc);
4594 Py_XDECREF(errorHandler);
4595 return res;
4596
4597 onError:
4598 Py_XDECREF(res);
4599 Py_XDECREF(exc);
4600 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 return NULL;
4602}
4603
4604PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4605 PyObject *mapping)
4606{
4607 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4608 PyErr_BadArgument();
4609 return NULL;
4610 }
4611 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4612 PyUnicode_GET_SIZE(unicode),
4613 mapping,
4614 NULL);
4615}
4616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617/* create or adjust a UnicodeTranslateError */
4618static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004619 const Py_UNICODE *unicode, Py_ssize_t size,
4620 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 if (*exceptionObject == NULL) {
4624 *exceptionObject = PyUnicodeTranslateError_Create(
4625 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 }
4627 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4629 goto onError;
4630 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4631 goto onError;
4632 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4633 goto onError;
4634 return;
4635 onError:
4636 Py_DECREF(*exceptionObject);
4637 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 }
4639}
4640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641/* raises a UnicodeTranslateError */
4642static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004643 const Py_UNICODE *unicode, Py_ssize_t size,
4644 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 const char *reason)
4646{
4647 make_translate_exception(exceptionObject,
4648 unicode, size, startpos, endpos, reason);
4649 if (*exceptionObject != NULL)
4650 PyCodec_StrictErrors(*exceptionObject);
4651}
4652
4653/* error handling callback helper:
4654 build arguments, call the callback and check the arguments,
4655 put the result into newpos and return the replacement string, which
4656 has to be freed by the caller */
4657static PyObject *unicode_translate_call_errorhandler(const char *errors,
4658 PyObject **errorHandler,
4659 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4661 Py_ssize_t startpos, Py_ssize_t endpos,
4662 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004664 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004666 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667 PyObject *restuple;
4668 PyObject *resunicode;
4669
4670 if (*errorHandler == NULL) {
4671 *errorHandler = PyCodec_LookupError(errors);
4672 if (*errorHandler == NULL)
4673 return NULL;
4674 }
4675
4676 make_translate_exception(exceptionObject,
4677 unicode, size, startpos, endpos, reason);
4678 if (*exceptionObject == NULL)
4679 return NULL;
4680
4681 restuple = PyObject_CallFunctionObjArgs(
4682 *errorHandler, *exceptionObject, NULL);
4683 if (restuple == NULL)
4684 return NULL;
4685 if (!PyTuple_Check(restuple)) {
4686 PyErr_Format(PyExc_TypeError, &argparse[4]);
4687 Py_DECREF(restuple);
4688 return NULL;
4689 }
4690 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004691 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 Py_DECREF(restuple);
4693 return NULL;
4694 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004695 if (i_newpos<0)
4696 *newpos = size+i_newpos;
4697 else
4698 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004699 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004700 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004701 Py_DECREF(restuple);
4702 return NULL;
4703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004704 Py_INCREF(resunicode);
4705 Py_DECREF(restuple);
4706 return resunicode;
4707}
4708
4709/* Lookup the character ch in the mapping and put the result in result,
4710 which must be decrefed by the caller.
4711 Return 0 on success, -1 on error */
4712static
4713int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4714{
4715 PyObject *w = PyInt_FromLong((long)c);
4716 PyObject *x;
4717
4718 if (w == NULL)
4719 return -1;
4720 x = PyObject_GetItem(mapping, w);
4721 Py_DECREF(w);
4722 if (x == NULL) {
4723 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4724 /* No mapping found means: use 1:1 mapping. */
4725 PyErr_Clear();
4726 *result = NULL;
4727 return 0;
4728 } else
4729 return -1;
4730 }
4731 else if (x == Py_None) {
4732 *result = x;
4733 return 0;
4734 }
4735 else if (PyInt_Check(x)) {
4736 long value = PyInt_AS_LONG(x);
4737 long max = PyUnicode_GetMax();
4738 if (value < 0 || value > max) {
4739 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004740 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 Py_DECREF(x);
4742 return -1;
4743 }
4744 *result = x;
4745 return 0;
4746 }
4747 else if (PyUnicode_Check(x)) {
4748 *result = x;
4749 return 0;
4750 }
4751 else {
4752 /* wrong return value */
4753 PyErr_SetString(PyExc_TypeError,
4754 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004755 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 return -1;
4757 }
4758}
4759/* ensure that *outobj is at least requiredsize characters long,
4760if not reallocate and adjust various state variables.
4761Return 0 on success, -1 on error */
4762static
Walter Dörwald4894c302003-10-24 14:25:28 +00004763int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004764 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004766 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004767 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004771 if (requiredsize < 2 * oldsize)
4772 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004773 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 return -1;
4775 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 }
4777 return 0;
4778}
4779/* lookup the character, put the result in the output string and adjust
4780 various state variables. Return a new reference to the object that
4781 was put in the output buffer in *result, or Py_None, if the mapping was
4782 undefined (in which case no character was written).
4783 The called must decref result.
4784 Return 0 on success, -1 on error. */
4785static
Walter Dörwald4894c302003-10-24 14:25:28 +00004786int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004787 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004788 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789{
Walter Dörwald4894c302003-10-24 14:25:28 +00004790 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 return -1;
4792 if (*res==NULL) {
4793 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004794 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 }
4796 else if (*res==Py_None)
4797 ;
4798 else if (PyInt_Check(*res)) {
4799 /* no overflow check, because we know that the space is enough */
4800 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4801 }
4802 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 if (repsize==1) {
4805 /* no overflow check, because we know that the space is enough */
4806 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4807 }
4808 else if (repsize!=0) {
4809 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004810 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004811 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004812 repsize - 1;
4813 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 return -1;
4815 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4816 *outp += repsize;
4817 }
4818 }
4819 else
4820 return -1;
4821 return 0;
4822}
4823
4824PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004825 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 PyObject *mapping,
4827 const char *errors)
4828{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 /* output object */
4830 PyObject *res = NULL;
4831 /* pointers to the beginning and end+1 of input */
4832 const Py_UNICODE *startp = p;
4833 const Py_UNICODE *endp = p + size;
4834 /* pointer into the output */
4835 Py_UNICODE *str;
4836 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004837 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004838 char *reason = "character maps to <undefined>";
4839 PyObject *errorHandler = NULL;
4840 PyObject *exc = NULL;
4841 /* the following variable is used for caching string comparisons
4842 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4843 * 3=ignore, 4=xmlcharrefreplace */
4844 int known_errorHandler = -1;
4845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 if (mapping == NULL) {
4847 PyErr_BadArgument();
4848 return NULL;
4849 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850
4851 /* allocate enough for a simple 1:1 translation without
4852 replacements, if we need more, we'll resize */
4853 res = PyUnicode_FromUnicode(NULL, size);
4854 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 return res;
4858 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 while (p<endp) {
4861 /* try to encode it */
4862 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004863 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 goto onError;
4866 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004867 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 if (x!=Py_None) /* it worked => adjust input pointer */
4869 ++p;
4870 else { /* untranslatable character */
4871 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872 Py_ssize_t repsize;
4873 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 Py_UNICODE *uni2;
4875 /* startpos for collecting untranslatable chars */
4876 const Py_UNICODE *collstart = p;
4877 const Py_UNICODE *collend = p+1;
4878 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 /* find all untranslatable characters */
4881 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004882 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 goto onError;
4884 Py_XDECREF(x);
4885 if (x!=Py_None)
4886 break;
4887 ++collend;
4888 }
4889 /* cache callback name lookup
4890 * (if not done yet, i.e. it's the first error) */
4891 if (known_errorHandler==-1) {
4892 if ((errors==NULL) || (!strcmp(errors, "strict")))
4893 known_errorHandler = 1;
4894 else if (!strcmp(errors, "replace"))
4895 known_errorHandler = 2;
4896 else if (!strcmp(errors, "ignore"))
4897 known_errorHandler = 3;
4898 else if (!strcmp(errors, "xmlcharrefreplace"))
4899 known_errorHandler = 4;
4900 else
4901 known_errorHandler = 0;
4902 }
4903 switch (known_errorHandler) {
4904 case 1: /* strict */
4905 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4906 goto onError;
4907 case 2: /* replace */
4908 /* No need to check for space, this is a 1:1 replacement */
4909 for (coll = collstart; coll<collend; ++coll)
4910 *str++ = '?';
4911 /* fall through */
4912 case 3: /* ignore */
4913 p = collend;
4914 break;
4915 case 4: /* xmlcharrefreplace */
4916 /* generate replacement (temporarily (mis)uses p) */
4917 for (p = collstart; p < collend; ++p) {
4918 char buffer[2+29+1+1];
4919 char *cp;
4920 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004921 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4923 goto onError;
4924 for (cp = buffer; *cp; ++cp)
4925 *str++ = *cp;
4926 }
4927 p = collend;
4928 break;
4929 default:
4930 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4931 reason, startp, size, &exc,
4932 collstart-startp, collend-startp, &newpos);
4933 if (repunicode == NULL)
4934 goto onError;
4935 /* generate replacement */
4936 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004937 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4939 Py_DECREF(repunicode);
4940 goto onError;
4941 }
4942 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4943 *str++ = *uni2;
4944 p = startp + newpos;
4945 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 }
4947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 /* Resize if we allocated to much */
4950 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004951 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004952 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004953 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004954 }
4955 Py_XDECREF(exc);
4956 Py_XDECREF(errorHandler);
4957 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 onError:
4960 Py_XDECREF(res);
4961 Py_XDECREF(exc);
4962 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 return NULL;
4964}
4965
4966PyObject *PyUnicode_Translate(PyObject *str,
4967 PyObject *mapping,
4968 const char *errors)
4969{
4970 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004971
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 str = PyUnicode_FromObject(str);
4973 if (str == NULL)
4974 goto onError;
4975 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4976 PyUnicode_GET_SIZE(str),
4977 mapping,
4978 errors);
4979 Py_DECREF(str);
4980 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004981
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 onError:
4983 Py_XDECREF(str);
4984 return NULL;
4985}
Tim Petersced69f82003-09-16 20:30:58 +00004986
Guido van Rossum9e896b32000-04-05 20:11:21 +00004987/* --- Decimal Encoder ---------------------------------------------------- */
4988
4989int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004990 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004991 char *output,
4992 const char *errors)
4993{
4994 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 PyObject *errorHandler = NULL;
4996 PyObject *exc = NULL;
4997 const char *encoding = "decimal";
4998 const char *reason = "invalid decimal Unicode string";
4999 /* the following variable is used for caching string comparisons
5000 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5001 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005002
5003 if (output == NULL) {
5004 PyErr_BadArgument();
5005 return -1;
5006 }
5007
5008 p = s;
5009 end = s + length;
5010 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005012 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005014 Py_ssize_t repsize;
5015 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 Py_UNICODE *uni2;
5017 Py_UNICODE *collstart;
5018 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005019
Guido van Rossum9e896b32000-04-05 20:11:21 +00005020 if (Py_UNICODE_ISSPACE(ch)) {
5021 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005023 continue;
5024 }
5025 decimal = Py_UNICODE_TODECIMAL(ch);
5026 if (decimal >= 0) {
5027 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005029 continue;
5030 }
Guido van Rossumba477042000-04-06 18:18:10 +00005031 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005032 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005034 continue;
5035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036 /* All other characters are considered unencodable */
5037 collstart = p;
5038 collend = p+1;
5039 while (collend < end) {
5040 if ((0 < *collend && *collend < 256) ||
5041 !Py_UNICODE_ISSPACE(*collend) ||
5042 Py_UNICODE_TODECIMAL(*collend))
5043 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 /* cache callback name lookup
5046 * (if not done yet, i.e. it's the first error) */
5047 if (known_errorHandler==-1) {
5048 if ((errors==NULL) || (!strcmp(errors, "strict")))
5049 known_errorHandler = 1;
5050 else if (!strcmp(errors, "replace"))
5051 known_errorHandler = 2;
5052 else if (!strcmp(errors, "ignore"))
5053 known_errorHandler = 3;
5054 else if (!strcmp(errors, "xmlcharrefreplace"))
5055 known_errorHandler = 4;
5056 else
5057 known_errorHandler = 0;
5058 }
5059 switch (known_errorHandler) {
5060 case 1: /* strict */
5061 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5062 goto onError;
5063 case 2: /* replace */
5064 for (p = collstart; p < collend; ++p)
5065 *output++ = '?';
5066 /* fall through */
5067 case 3: /* ignore */
5068 p = collend;
5069 break;
5070 case 4: /* xmlcharrefreplace */
5071 /* generate replacement (temporarily (mis)uses p) */
5072 for (p = collstart; p < collend; ++p)
5073 output += sprintf(output, "&#%d;", (int)*p);
5074 p = collend;
5075 break;
5076 default:
5077 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5078 encoding, reason, s, length, &exc,
5079 collstart-s, collend-s, &newpos);
5080 if (repunicode == NULL)
5081 goto onError;
5082 /* generate replacement */
5083 repsize = PyUnicode_GET_SIZE(repunicode);
5084 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5085 Py_UNICODE ch = *uni2;
5086 if (Py_UNICODE_ISSPACE(ch))
5087 *output++ = ' ';
5088 else {
5089 decimal = Py_UNICODE_TODECIMAL(ch);
5090 if (decimal >= 0)
5091 *output++ = '0' + decimal;
5092 else if (0 < ch && ch < 256)
5093 *output++ = (char)ch;
5094 else {
5095 Py_DECREF(repunicode);
5096 raise_encode_exception(&exc, encoding,
5097 s, length, collstart-s, collend-s, reason);
5098 goto onError;
5099 }
5100 }
5101 }
5102 p = s + newpos;
5103 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005104 }
5105 }
5106 /* 0-terminate the output string */
5107 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005108 Py_XDECREF(exc);
5109 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005110 return 0;
5111
5112 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 Py_XDECREF(exc);
5114 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005115 return -1;
5116}
5117
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118/* --- Helpers ------------------------------------------------------------ */
5119
Eric Smith8c663262007-08-25 02:26:07 +00005120#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005121
5122#include "stringlib/fastsearch.h"
5123
5124#include "stringlib/count.h"
5125#include "stringlib/find.h"
5126#include "stringlib/partition.h"
5127
/* helper macro to fixup start/end slice values.
   Operates on the variables named start and end of the calling scope,
   using (obj)->length as the sequence length: negative values are
   counted from the end and then clamped to 0, and end is capped at the
   length.  start is not capped at the length.
   Wrapped in do { } while (0) so that the expansion is one statement
   and stays intact under an unbraced if/else; invoke it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5140
Martin v. Löwis18e16552006-02-15 17:27:45 +00005141Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005142 PyObject *substr,
5143 Py_ssize_t start,
5144 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005146 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005147 PyUnicodeObject* str_obj;
5148 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005149
Thomas Wouters477c8d52006-05-27 19:21:47 +00005150 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5151 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005153 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5154 if (!sub_obj) {
5155 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 return -1;
5157 }
Tim Petersced69f82003-09-16 20:30:58 +00005158
Thomas Wouters477c8d52006-05-27 19:21:47 +00005159 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005160
Thomas Wouters477c8d52006-05-27 19:21:47 +00005161 result = stringlib_count(
5162 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5163 );
5164
5165 Py_DECREF(sub_obj);
5166 Py_DECREF(str_obj);
5167
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 return result;
5169}
5170
Martin v. Löwis18e16552006-02-15 17:27:45 +00005171Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005172 PyObject *sub,
5173 Py_ssize_t start,
5174 Py_ssize_t end,
5175 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005177 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005180 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005181 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005182 sub = PyUnicode_FromObject(sub);
5183 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005184 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005185 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 }
Tim Petersced69f82003-09-16 20:30:58 +00005187
Thomas Wouters477c8d52006-05-27 19:21:47 +00005188 if (direction > 0)
5189 result = stringlib_find_slice(
5190 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5191 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5192 start, end
5193 );
5194 else
5195 result = stringlib_rfind_slice(
5196 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5197 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5198 start, end
5199 );
5200
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005202 Py_DECREF(sub);
5203
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 return result;
5205}
5206
Tim Petersced69f82003-09-16 20:30:58 +00005207static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208int tailmatch(PyUnicodeObject *self,
5209 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005210 Py_ssize_t start,
5211 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 int direction)
5213{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 if (substring->length == 0)
5215 return 1;
5216
Thomas Wouters477c8d52006-05-27 19:21:47 +00005217 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
5219 end -= substring->length;
5220 if (end < start)
5221 return 0;
5222
5223 if (direction > 0) {
5224 if (Py_UNICODE_MATCH(self, end, substring))
5225 return 1;
5226 } else {
5227 if (Py_UNICODE_MATCH(self, start, substring))
5228 return 1;
5229 }
5230
5231 return 0;
5232}
5233
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005236 Py_ssize_t start,
5237 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 int direction)
5239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005240 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005241
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 str = PyUnicode_FromObject(str);
5243 if (str == NULL)
5244 return -1;
5245 substr = PyUnicode_FromObject(substr);
5246 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005247 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 return -1;
5249 }
Tim Petersced69f82003-09-16 20:30:58 +00005250
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 result = tailmatch((PyUnicodeObject *)str,
5252 (PyUnicodeObject *)substr,
5253 start, end, direction);
5254 Py_DECREF(str);
5255 Py_DECREF(substr);
5256 return result;
5257}
5258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259/* Apply fixfct filter to the Unicode object self and return a
5260 reference to the modified object */
5261
Tim Petersced69f82003-09-16 20:30:58 +00005262static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263PyObject *fixup(PyUnicodeObject *self,
5264 int (*fixfct)(PyUnicodeObject *s))
5265{
5266
5267 PyUnicodeObject *u;
5268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005269 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 if (u == NULL)
5271 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005272
5273 Py_UNICODE_COPY(u->str, self->str, self->length);
5274
Tim Peters7a29bd52001-09-12 03:03:31 +00005275 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 /* fixfct should return TRUE if it modified the buffer. If
5277 FALSE, return a reference to the original buffer instead
5278 (to save space, not time) */
5279 Py_INCREF(self);
5280 Py_DECREF(u);
5281 return (PyObject*) self;
5282 }
5283 return (PyObject*) u;
5284}
5285
Tim Petersced69f82003-09-16 20:30:58 +00005286static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287int fixupper(PyUnicodeObject *self)
5288{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005289 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 Py_UNICODE *s = self->str;
5291 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005292
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 while (len-- > 0) {
5294 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005295
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 ch = Py_UNICODE_TOUPPER(*s);
5297 if (ch != *s) {
5298 status = 1;
5299 *s = ch;
5300 }
5301 s++;
5302 }
5303
5304 return status;
5305}
5306
Tim Petersced69f82003-09-16 20:30:58 +00005307static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308int fixlower(PyUnicodeObject *self)
5309{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005310 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 Py_UNICODE *s = self->str;
5312 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005313
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 while (len-- > 0) {
5315 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005316
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 ch = Py_UNICODE_TOLOWER(*s);
5318 if (ch != *s) {
5319 status = 1;
5320 *s = ch;
5321 }
5322 s++;
5323 }
5324
5325 return status;
5326}
5327
Tim Petersced69f82003-09-16 20:30:58 +00005328static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329int fixswapcase(PyUnicodeObject *self)
5330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 Py_UNICODE *s = self->str;
5333 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005334
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 while (len-- > 0) {
5336 if (Py_UNICODE_ISUPPER(*s)) {
5337 *s = Py_UNICODE_TOLOWER(*s);
5338 status = 1;
5339 } else if (Py_UNICODE_ISLOWER(*s)) {
5340 *s = Py_UNICODE_TOUPPER(*s);
5341 status = 1;
5342 }
5343 s++;
5344 }
5345
5346 return status;
5347}
5348
Tim Petersced69f82003-09-16 20:30:58 +00005349static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350int fixcapitalize(PyUnicodeObject *self)
5351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005353 Py_UNICODE *s = self->str;
5354 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005355
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005356 if (len == 0)
5357 return 0;
5358 if (Py_UNICODE_ISLOWER(*s)) {
5359 *s = Py_UNICODE_TOUPPER(*s);
5360 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005362 s++;
5363 while (--len > 0) {
5364 if (Py_UNICODE_ISUPPER(*s)) {
5365 *s = Py_UNICODE_TOLOWER(*s);
5366 status = 1;
5367 }
5368 s++;
5369 }
5370 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371}
5372
5373static
5374int fixtitle(PyUnicodeObject *self)
5375{
5376 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5377 register Py_UNICODE *e;
5378 int previous_is_cased;
5379
5380 /* Shortcut for single character strings */
5381 if (PyUnicode_GET_SIZE(self) == 1) {
5382 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5383 if (*p != ch) {
5384 *p = ch;
5385 return 1;
5386 }
5387 else
5388 return 0;
5389 }
Tim Petersced69f82003-09-16 20:30:58 +00005390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 e = p + PyUnicode_GET_SIZE(self);
5392 previous_is_cased = 0;
5393 for (; p < e; p++) {
5394 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 if (previous_is_cased)
5397 *p = Py_UNICODE_TOLOWER(ch);
5398 else
5399 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005400
5401 if (Py_UNICODE_ISLOWER(ch) ||
5402 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 Py_UNICODE_ISTITLE(ch))
5404 previous_is_cased = 1;
5405 else
5406 previous_is_cased = 0;
5407 }
5408 return 1;
5409}
5410
Tim Peters8ce9f162004-08-27 01:49:32 +00005411PyObject *
5412PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413{
Tim Peters8ce9f162004-08-27 01:49:32 +00005414 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005415 const Py_UNICODE blank = ' ';
5416 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005417 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005418 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005419 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5420 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005421 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5422 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005423 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005424 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005425 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
Tim Peters05eba1f2004-08-27 21:32:02 +00005427 fseq = PySequence_Fast(seq, "");
5428 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005429 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005430 }
5431
Tim Peters91879ab2004-08-27 22:35:44 +00005432 /* Grrrr. A codec may be invoked to convert str objects to
5433 * Unicode, and so it's possible to call back into Python code
5434 * during PyUnicode_FromObject(), and so it's possible for a sick
5435 * codec to change the size of fseq (if seq is a list). Therefore
5436 * we have to keep refetching the size -- can't assume seqlen
5437 * is invariant.
5438 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005439 seqlen = PySequence_Fast_GET_SIZE(fseq);
5440 /* If empty sequence, return u"". */
5441 if (seqlen == 0) {
5442 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5443 goto Done;
5444 }
5445 /* If singleton sequence with an exact Unicode, return that. */
5446 if (seqlen == 1) {
5447 item = PySequence_Fast_GET_ITEM(fseq, 0);
5448 if (PyUnicode_CheckExact(item)) {
5449 Py_INCREF(item);
5450 res = (PyUnicodeObject *)item;
5451 goto Done;
5452 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005453 }
5454
Tim Peters05eba1f2004-08-27 21:32:02 +00005455 /* At least two items to join, or one that isn't exact Unicode. */
5456 if (seqlen > 1) {
5457 /* Set up sep and seplen -- they're needed. */
5458 if (separator == NULL) {
5459 sep = &blank;
5460 seplen = 1;
5461 }
5462 else {
5463 internal_separator = PyUnicode_FromObject(separator);
5464 if (internal_separator == NULL)
5465 goto onError;
5466 sep = PyUnicode_AS_UNICODE(internal_separator);
5467 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005468 /* In case PyUnicode_FromObject() mutated seq. */
5469 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005470 }
5471 }
5472
5473 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005474 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005475 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005476 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005477 res_p = PyUnicode_AS_UNICODE(res);
5478 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005479
Tim Peters05eba1f2004-08-27 21:32:02 +00005480 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005481 Py_ssize_t itemlen;
5482 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005483
5484 item = PySequence_Fast_GET_ITEM(fseq, i);
5485 /* Convert item to Unicode. */
Guido van Rossumf1044292007-09-27 18:01:22 +00005486 if (!PyString_Check(item) && !PyUnicode_Check(item))
5487 {
5488 if (PyBytes_Check(item))
5489 {
5490 PyErr_Format(PyExc_TypeError,
5491 "sequence item %d: join() will not operate on "
5492 "bytes objects", i);
5493 goto onError;
5494 }
5495 item = PyObject_Unicode(item);
Tim Peters8ce9f162004-08-27 01:49:32 +00005496 }
Guido van Rossumf1044292007-09-27 18:01:22 +00005497 else
5498 item = PyUnicode_FromObject(item);
5499
Tim Peters05eba1f2004-08-27 21:32:02 +00005500 if (item == NULL)
5501 goto onError;
5502 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005503
Tim Peters91879ab2004-08-27 22:35:44 +00005504 /* In case PyUnicode_FromObject() mutated seq. */
5505 seqlen = PySequence_Fast_GET_SIZE(fseq);
5506
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005509 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005510 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005511 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005512 if (i < seqlen - 1) {
5513 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005514 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005515 goto Overflow;
5516 }
5517 if (new_res_used > res_alloc) {
5518 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005519 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005520 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005521 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005522 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005523 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005524 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005525 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005527 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005528 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005530
5531 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005532 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005533 res_p += itemlen;
5534 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005535 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005536 res_p += seplen;
5537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005539 res_used = new_res_used;
5540 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005541
Tim Peters05eba1f2004-08-27 21:32:02 +00005542 /* Shrink res to match the used area; this probably can't fail,
5543 * but it's cheap to check.
5544 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005545 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005546 goto onError;
5547
5548 Done:
5549 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 return (PyObject *)res;
5552
Tim Peters8ce9f162004-08-27 01:49:32 +00005553 Overflow:
5554 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005555 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005556 Py_DECREF(item);
5557 /* fall through */
5558
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005560 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005561 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005562 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 return NULL;
5564}
5565
Tim Petersced69f82003-09-16 20:30:58 +00005566static
5567PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005568 Py_ssize_t left,
5569 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 Py_UNICODE fill)
5571{
5572 PyUnicodeObject *u;
5573
5574 if (left < 0)
5575 left = 0;
5576 if (right < 0)
5577 right = 0;
5578
Tim Peters7a29bd52001-09-12 03:03:31 +00005579 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 Py_INCREF(self);
5581 return self;
5582 }
5583
5584 u = _PyUnicode_New(left + self->length + right);
5585 if (u) {
5586 if (left)
5587 Py_UNICODE_FILL(u->str, fill, left);
5588 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5589 if (right)
5590 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5591 }
5592
5593 return u;
5594}
5595
/* Append the slice data[left:right] to `list` as a new unicode object.
 *
 * The expanding function must provide a `PyObject *str` scratch variable,
 * a `list` object and an `onError` label; control jumps to `onError` when
 * either the slice cannot be created or the append fails.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * cannot capture a following `else` at the call site.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5606
5607static
5608PyObject *split_whitespace(PyUnicodeObject *self,
5609 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005610 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005612 register Py_ssize_t i;
5613 register Py_ssize_t j;
5614 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 PyObject *str;
5616
5617 for (i = j = 0; i < len; ) {
5618 /* find a token */
5619 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5620 i++;
5621 j = i;
5622 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5623 i++;
5624 if (j < i) {
5625 if (maxcount-- <= 0)
5626 break;
5627 SPLIT_APPEND(self->str, j, i);
5628 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5629 i++;
5630 j = i;
5631 }
5632 }
5633 if (j < len) {
5634 SPLIT_APPEND(self->str, j, len);
5635 }
5636 return list;
5637
5638 onError:
5639 Py_DECREF(list);
5640 return NULL;
5641}
5642
5643PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005644 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005646 register Py_ssize_t i;
5647 register Py_ssize_t j;
5648 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 PyObject *list;
5650 PyObject *str;
5651 Py_UNICODE *data;
5652
5653 string = PyUnicode_FromObject(string);
5654 if (string == NULL)
5655 return NULL;
5656 data = PyUnicode_AS_UNICODE(string);
5657 len = PyUnicode_GET_SIZE(string);
5658
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 list = PyList_New(0);
5660 if (!list)
5661 goto onError;
5662
5663 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005664 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005665
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005667 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
5670 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005671 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 if (i < len) {
5673 if (data[i] == '\r' && i + 1 < len &&
5674 data[i+1] == '\n')
5675 i += 2;
5676 else
5677 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005678 if (keepends)
5679 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 }
Guido van Rossum86662912000-04-11 15:38:46 +00005681 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 j = i;
5683 }
5684 if (j < len) {
5685 SPLIT_APPEND(data, j, len);
5686 }
5687
5688 Py_DECREF(string);
5689 return list;
5690
5691 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005692 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 Py_DECREF(string);
5694 return NULL;
5695}
5696
Tim Petersced69f82003-09-16 20:30:58 +00005697static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698PyObject *split_char(PyUnicodeObject *self,
5699 PyObject *list,
5700 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005701 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 register Py_ssize_t i;
5704 register Py_ssize_t j;
5705 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 PyObject *str;
5707
5708 for (i = j = 0; i < len; ) {
5709 if (self->str[i] == ch) {
5710 if (maxcount-- <= 0)
5711 break;
5712 SPLIT_APPEND(self->str, j, i);
5713 i = j = i + 1;
5714 } else
5715 i++;
5716 }
5717 if (j <= len) {
5718 SPLIT_APPEND(self->str, j, len);
5719 }
5720 return list;
5721
5722 onError:
5723 Py_DECREF(list);
5724 return NULL;
5725}
5726
Tim Petersced69f82003-09-16 20:30:58 +00005727static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728PyObject *split_substring(PyUnicodeObject *self,
5729 PyObject *list,
5730 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005731 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005733 register Py_ssize_t i;
5734 register Py_ssize_t j;
5735 Py_ssize_t len = self->length;
5736 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 PyObject *str;
5738
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005739 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 if (Py_UNICODE_MATCH(self, i, substring)) {
5741 if (maxcount-- <= 0)
5742 break;
5743 SPLIT_APPEND(self->str, j, i);
5744 i = j = i + sublen;
5745 } else
5746 i++;
5747 }
5748 if (j <= len) {
5749 SPLIT_APPEND(self->str, j, len);
5750 }
5751 return list;
5752
5753 onError:
5754 Py_DECREF(list);
5755 return NULL;
5756}
5757
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005758static
5759PyObject *rsplit_whitespace(PyUnicodeObject *self,
5760 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005761 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005762{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005763 register Py_ssize_t i;
5764 register Py_ssize_t j;
5765 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005766 PyObject *str;
5767
5768 for (i = j = len - 1; i >= 0; ) {
5769 /* find a token */
5770 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5771 i--;
5772 j = i;
5773 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5774 i--;
5775 if (j > i) {
5776 if (maxcount-- <= 0)
5777 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005778 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5780 i--;
5781 j = i;
5782 }
5783 }
5784 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005785 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005786 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005787 if (PyList_Reverse(list) < 0)
5788 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005789 return list;
5790
5791 onError:
5792 Py_DECREF(list);
5793 return NULL;
5794}
5795
5796static
5797PyObject *rsplit_char(PyUnicodeObject *self,
5798 PyObject *list,
5799 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005800 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005801{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005802 register Py_ssize_t i;
5803 register Py_ssize_t j;
5804 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005805 PyObject *str;
5806
5807 for (i = j = len - 1; i >= 0; ) {
5808 if (self->str[i] == ch) {
5809 if (maxcount-- <= 0)
5810 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005811 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005812 j = i = i - 1;
5813 } else
5814 i--;
5815 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005816 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005817 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005818 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005819 if (PyList_Reverse(list) < 0)
5820 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005821 return list;
5822
5823 onError:
5824 Py_DECREF(list);
5825 return NULL;
5826}
5827
5828static
5829PyObject *rsplit_substring(PyUnicodeObject *self,
5830 PyObject *list,
5831 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005832 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005833{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005834 register Py_ssize_t i;
5835 register Py_ssize_t j;
5836 Py_ssize_t len = self->length;
5837 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838 PyObject *str;
5839
5840 for (i = len - sublen, j = len; i >= 0; ) {
5841 if (Py_UNICODE_MATCH(self, i, substring)) {
5842 if (maxcount-- <= 0)
5843 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005844 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005845 j = i;
5846 i -= sublen;
5847 } else
5848 i--;
5849 }
5850 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005851 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005853 if (PyList_Reverse(list) < 0)
5854 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005855 return list;
5856
5857 onError:
5858 Py_DECREF(list);
5859 return NULL;
5860}
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862#undef SPLIT_APPEND
5863
5864static
5865PyObject *split(PyUnicodeObject *self,
5866 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868{
5869 PyObject *list;
5870
5871 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005872 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
5874 list = PyList_New(0);
5875 if (!list)
5876 return NULL;
5877
5878 if (substring == NULL)
5879 return split_whitespace(self,list,maxcount);
5880
5881 else if (substring->length == 1)
5882 return split_char(self,list,substring->str[0],maxcount);
5883
5884 else if (substring->length == 0) {
5885 Py_DECREF(list);
5886 PyErr_SetString(PyExc_ValueError, "empty separator");
5887 return NULL;
5888 }
5889 else
5890 return split_substring(self,list,substring,maxcount);
5891}
5892
Tim Petersced69f82003-09-16 20:30:58 +00005893static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005894PyObject *rsplit(PyUnicodeObject *self,
5895 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005896 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005897{
5898 PyObject *list;
5899
5900 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005901 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005902
5903 list = PyList_New(0);
5904 if (!list)
5905 return NULL;
5906
5907 if (substring == NULL)
5908 return rsplit_whitespace(self,list,maxcount);
5909
5910 else if (substring->length == 1)
5911 return rsplit_char(self,list,substring->str[0],maxcount);
5912
5913 else if (substring->length == 0) {
5914 Py_DECREF(list);
5915 PyErr_SetString(PyExc_ValueError, "empty separator");
5916 return NULL;
5917 }
5918 else
5919 return rsplit_substring(self,list,substring,maxcount);
5920}
5921
5922static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923PyObject *replace(PyUnicodeObject *self,
5924 PyUnicodeObject *str1,
5925 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927{
5928 PyUnicodeObject *u;
5929
5930 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005931 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
Thomas Wouters477c8d52006-05-27 19:21:47 +00005933 if (str1->length == str2->length) {
5934 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005935 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005936 if (str1->length == 1) {
5937 /* replace characters */
5938 Py_UNICODE u1, u2;
5939 if (!findchar(self->str, self->length, str1->str[0]))
5940 goto nothing;
5941 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5942 if (!u)
5943 return NULL;
5944 Py_UNICODE_COPY(u->str, self->str, self->length);
5945 u1 = str1->str[0];
5946 u2 = str2->str[0];
5947 for (i = 0; i < u->length; i++)
5948 if (u->str[i] == u1) {
5949 if (--maxcount < 0)
5950 break;
5951 u->str[i] = u2;
5952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954 i = fastsearch(
5955 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005957 if (i < 0)
5958 goto nothing;
5959 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5960 if (!u)
5961 return NULL;
5962 Py_UNICODE_COPY(u->str, self->str, self->length);
5963 while (i <= self->length - str1->length)
5964 if (Py_UNICODE_MATCH(self, i, str1)) {
5965 if (--maxcount < 0)
5966 break;
5967 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5968 i += str1->length;
5969 } else
5970 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005973
5974 Py_ssize_t n, i, j, e;
5975 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 Py_UNICODE *p;
5977
5978 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005979 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 if (n > maxcount)
5981 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005982 if (n == 0)
5983 goto nothing;
5984 /* new_size = self->length + n * (str2->length - str1->length)); */
5985 delta = (str2->length - str1->length);
5986 if (delta == 0) {
5987 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005989 product = n * (str2->length - str1->length);
5990 if ((product / (str2->length - str1->length)) != n) {
5991 PyErr_SetString(PyExc_OverflowError,
5992 "replace string is too long");
5993 return NULL;
5994 }
5995 new_size = self->length + product;
5996 if (new_size < 0) {
5997 PyErr_SetString(PyExc_OverflowError,
5998 "replace string is too long");
5999 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 }
6001 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006002 u = _PyUnicode_New(new_size);
6003 if (!u)
6004 return NULL;
6005 i = 0;
6006 p = u->str;
6007 e = self->length - str1->length;
6008 if (str1->length > 0) {
6009 while (n-- > 0) {
6010 /* look for next match */
6011 j = i;
6012 while (j <= e) {
6013 if (Py_UNICODE_MATCH(self, j, str1))
6014 break;
6015 j++;
6016 }
6017 if (j > i) {
6018 if (j > e)
6019 break;
6020 /* copy unchanged part [i:j] */
6021 Py_UNICODE_COPY(p, self->str+i, j-i);
6022 p += j - i;
6023 }
6024 /* copy substitution string */
6025 if (str2->length > 0) {
6026 Py_UNICODE_COPY(p, str2->str, str2->length);
6027 p += str2->length;
6028 }
6029 i = j + str1->length;
6030 }
6031 if (i < self->length)
6032 /* copy tail [i:] */
6033 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6034 } else {
6035 /* interleave */
6036 while (n > 0) {
6037 Py_UNICODE_COPY(p, str2->str, str2->length);
6038 p += str2->length;
6039 if (--n <= 0)
6040 break;
6041 *p++ = self->str[i++];
6042 }
6043 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006047
6048nothing:
6049 /* nothing to replace; return original string (when possible) */
6050 if (PyUnicode_CheckExact(self)) {
6051 Py_INCREF(self);
6052 return (PyObject *) self;
6053 }
6054 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055}
6056
6057/* --- Unicode Object Methods --------------------------------------------- */
6058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006059PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060"S.title() -> unicode\n\
6061\n\
6062Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006063characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
6065static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006066unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 return fixup(self, fixtitle);
6069}
6070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072"S.capitalize() -> unicode\n\
6073\n\
6074Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006075have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
6077static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006078unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 return fixup(self, fixcapitalize);
6081}
6082
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, capitalize
 * every word in place in the list, then join the words again.
 * Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;
    int failed = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot with the new object */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            failed = 1;
            break;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    if (!failed)
        result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return result;
}
#endif
6120
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006121/* Argument converter. Coerces to a single unicode character */
6122
6123static int
6124convert_uc(PyObject *obj, void *addr)
6125{
6126 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6127 PyObject *uniobj;
6128 Py_UNICODE *unistr;
6129
6130 uniobj = PyUnicode_FromObject(obj);
6131 if (uniobj == NULL) {
6132 PyErr_SetString(PyExc_TypeError,
6133 "The fill character cannot be converted to Unicode");
6134 return 0;
6135 }
6136 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6137 PyErr_SetString(PyExc_TypeError,
6138 "The fill character must be exactly one character long");
6139 Py_DECREF(uniobj);
6140 return 0;
6141 }
6142 unistr = PyUnicode_AS_UNICODE(uniobj);
6143 *fillcharloc = unistr[0];
6144 Py_DECREF(uniobj);
6145 return 1;
6146}
6147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006148PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006149"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006151Return S centered in a Unicode string of length width. Padding is\n\
6152done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153
6154static PyObject *
6155unicode_center(PyUnicodeObject *self, PyObject *args)
6156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006157 Py_ssize_t marg, left;
6158 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006159 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160
Thomas Woutersde017742006-02-16 19:34:37 +00006161 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 return NULL;
6163
Tim Peters7a29bd52001-09-12 03:03:31 +00006164 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 Py_INCREF(self);
6166 return (PyObject*) self;
6167 }
6168
6169 marg = width - self->length;
6170 left = marg / 2 + (marg & width & 1);
6171
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006172 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173}
6174
Marc-André Lemburge5034372000-08-08 08:04:29 +00006175#if 0
6176
6177/* This code should go into some future Unicode collation support
6178 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006179 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006180
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006181/* speedy UTF-16 code point order comparison */
6182/* gleaned from: */
6183/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6184
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006185static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006186{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006187 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006188 0, 0, 0, 0, 0, 0, 0, 0,
6189 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006190 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006191};
6192
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193static int
6194unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6195{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006196 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006197
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 Py_UNICODE *s1 = str1->str;
6199 Py_UNICODE *s2 = str2->str;
6200
6201 len1 = str1->length;
6202 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006203
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006205 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006206
6207 c1 = *s1++;
6208 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006209
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006210 if (c1 > (1<<11) * 26)
6211 c1 += utf16Fixup[c1>>11];
6212 if (c2 > (1<<11) * 26)
6213 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006214 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006215
6216 if (c1 != c2)
6217 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006218
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006219 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
6221
6222 return (len1 < len2) ? -1 : (len1 != len2);
6223}
6224
Marc-André Lemburge5034372000-08-08 08:04:29 +00006225#else
6226
6227static int
6228unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6229{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006230 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006231
6232 Py_UNICODE *s1 = str1->str;
6233 Py_UNICODE *s2 = str2->str;
6234
6235 len1 = str1->length;
6236 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006237
Marc-André Lemburge5034372000-08-08 08:04:29 +00006238 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006239 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006240
Fredrik Lundh45714e92001-06-26 16:39:36 +00006241 c1 = *s1++;
6242 c2 = *s2++;
6243
6244 if (c1 != c2)
6245 return (c1 < c2) ? -1 : 1;
6246
Marc-André Lemburge5034372000-08-08 08:04:29 +00006247 len1--; len2--;
6248 }
6249
6250 return (len1 < len2) ? -1 : (len1 != len2);
6251}
6252
6253#endif
6254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255int PyUnicode_Compare(PyObject *left,
6256 PyObject *right)
6257{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006258 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6259 return unicode_compare((PyUnicodeObject *)left,
6260 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006261 PyErr_Format(PyExc_TypeError,
6262 "Can't compare %.100s and %.100s",
6263 left->ob_type->tp_name,
6264 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 return -1;
6266}
6267
Martin v. Löwis5b222132007-06-10 09:51:05 +00006268int
6269PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6270{
6271 int i;
6272 Py_UNICODE *id;
6273 assert(PyUnicode_Check(uni));
6274 id = PyUnicode_AS_UNICODE(uni);
6275 /* Compare Unicode string and source character set string */
6276 for (i = 0; id[i] && str[i]; i++)
6277 if (id[i] != str[i])
6278 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6279 if (id[i])
6280 return 1; /* uni is longer */
6281 if (str[i])
6282 return -1; /* str is longer */
6283 return 0;
6284}
6285
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006286PyObject *PyUnicode_RichCompare(PyObject *left,
6287 PyObject *right,
6288 int op)
6289{
6290 int result;
6291
6292 result = PyUnicode_Compare(left, right);
6293 if (result == -1 && PyErr_Occurred())
6294 goto onError;
6295
6296 /* Convert the return value to a Boolean */
6297 switch (op) {
6298 case Py_EQ:
6299 result = (result == 0);
6300 break;
6301 case Py_NE:
6302 result = (result != 0);
6303 break;
6304 case Py_LE:
6305 result = (result <= 0);
6306 break;
6307 case Py_GE:
6308 result = (result >= 0);
6309 break;
6310 case Py_LT:
6311 result = (result == -1);
6312 break;
6313 case Py_GT:
6314 result = (result == 1);
6315 break;
6316 }
6317 return PyBool_FromLong(result);
6318
6319 onError:
6320
6321 /* Standard case
6322
6323 Type errors mean that PyUnicode_FromObject() could not convert
6324 one of the arguments (usually the right hand side) to Unicode,
6325 ie. we can't handle the comparison request. However, it is
6326 possible that the other object knows a comparison method, which
6327 is why we return Py_NotImplemented to give the other object a
6328 chance.
6329
6330 */
6331 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6332 PyErr_Clear();
6333 Py_INCREF(Py_NotImplemented);
6334 return Py_NotImplemented;
6335 }
6336 if (op != Py_EQ && op != Py_NE)
6337 return NULL;
6338
6339 /* Equality comparison.
6340
6341 This is a special case: we silence any PyExc_UnicodeDecodeError
6342 and instead turn it into a PyErr_UnicodeWarning.
6343
6344 */
6345 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6346 return NULL;
6347 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006348 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6349 (op == Py_EQ) ?
6350 "Unicode equal comparison "
6351 "failed to convert both arguments to Unicode - "
6352 "interpreting them as being unequal"
6353 :
6354 "Unicode unequal comparison "
6355 "failed to convert both arguments to Unicode - "
6356 "interpreting them as being unequal",
6357 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006358 return NULL;
6359 result = (op == Py_NE);
6360 return PyBool_FromLong(result);
6361}
6362
Guido van Rossum403d68b2000-03-13 15:55:09 +00006363int PyUnicode_Contains(PyObject *container,
6364 PyObject *element)
6365{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006366 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006367 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006368
6369 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006370 sub = PyUnicode_FromObject(element);
6371 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006372 PyErr_Format(PyExc_TypeError,
6373 "'in <string>' requires string as left operand, not %s",
6374 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006375 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006376 }
6377
Thomas Wouters477c8d52006-05-27 19:21:47 +00006378 str = PyUnicode_FromObject(container);
6379 if (!str) {
6380 Py_DECREF(sub);
6381 return -1;
6382 }
6383
6384 result = stringlib_contains_obj(str, sub);
6385
6386 Py_DECREF(str);
6387 Py_DECREF(sub);
6388
Guido van Rossum403d68b2000-03-13 15:55:09 +00006389 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006390}
6391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392/* Concat to string or Unicode object giving a new Unicode object. */
6393
6394PyObject *PyUnicode_Concat(PyObject *left,
6395 PyObject *right)
6396{
6397 PyUnicodeObject *u = NULL, *v = NULL, *w;
6398
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006399 if (PyBytes_Check(left) || PyBytes_Check(right))
6400 return PyBytes_Concat(left, right);
6401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 /* Coerce the two arguments */
6403 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6404 if (u == NULL)
6405 goto onError;
6406 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6407 if (v == NULL)
6408 goto onError;
6409
6410 /* Shortcuts */
6411 if (v == unicode_empty) {
6412 Py_DECREF(v);
6413 return (PyObject *)u;
6414 }
6415 if (u == unicode_empty) {
6416 Py_DECREF(u);
6417 return (PyObject *)v;
6418 }
6419
6420 /* Concat the two Unicode strings */
6421 w = _PyUnicode_New(u->length + v->length);
6422 if (w == NULL)
6423 goto onError;
6424 Py_UNICODE_COPY(w->str, u->str, u->length);
6425 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6426
6427 Py_DECREF(u);
6428 Py_DECREF(v);
6429 return (PyObject *)w;
6430
6431onError:
6432 Py_XDECREF(u);
6433 Py_XDECREF(v);
6434 return NULL;
6435}
6436
Walter Dörwald1ab83302007-05-18 17:15:44 +00006437void
6438PyUnicode_Append(PyObject **pleft, PyObject *right)
6439{
6440 PyObject *new;
6441 if (*pleft == NULL)
6442 return;
6443 if (right == NULL || !PyUnicode_Check(*pleft)) {
6444 Py_DECREF(*pleft);
6445 *pleft = NULL;
6446 return;
6447 }
6448 new = PyUnicode_Concat(*pleft, right);
6449 Py_DECREF(*pleft);
6450 *pleft = new;
6451}
6452
6453void
6454PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6455{
6456 PyUnicode_Append(pleft, right);
6457 Py_XDECREF(right);
6458}
6459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006460PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461"S.count(sub[, start[, end]]) -> int\n\
6462\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006463Return the number of non-overlapping occurrences of substring sub in\n\
6464Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006465interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466
6467static PyObject *
6468unicode_count(PyUnicodeObject *self, PyObject *args)
6469{
6470 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006471 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006472 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 PyObject *result;
6474
Guido van Rossumb8872e62000-05-09 14:14:27 +00006475 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6476 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 return NULL;
6478
6479 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006480 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 if (substring == NULL)
6482 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006483
Thomas Wouters477c8d52006-05-27 19:21:47 +00006484 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485
Thomas Wouters477c8d52006-05-27 19:21:47 +00006486 result = PyInt_FromSsize_t(
6487 stringlib_count(self->str + start, end - start,
6488 substring->str, substring->length)
6489 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
6491 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006492
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 return result;
6494}
6495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006496PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006497"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006499Encodes S using the codec registered for encoding. encoding defaults\n\
6500to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006501handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006502a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6503'xmlcharrefreplace' as well as any other name registered with\n\
6504codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505
6506static PyObject *
6507unicode_encode(PyUnicodeObject *self, PyObject *args)
6508{
6509 char *encoding = NULL;
6510 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006511 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006512
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6514 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006515 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006516 if (v == NULL)
6517 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006518 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006519 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006520 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006521 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006522 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006523 Py_DECREF(v);
6524 return NULL;
6525 }
6526 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006527
6528 onError:
6529 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006530}
6531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006532PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533"S.expandtabs([tabsize]) -> unicode\n\
6534\n\
6535Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006536If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537
6538static PyObject*
6539unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6540{
6541 Py_UNICODE *e;
6542 Py_UNICODE *p;
6543 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006544 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 PyUnicodeObject *u;
6546 int tabsize = 8;
6547
6548 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6549 return NULL;
6550
Thomas Wouters7e474022000-07-16 12:04:32 +00006551 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006552 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 e = self->str + self->length;
6554 for (p = self->str; p < e; p++)
6555 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006556 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006558 if (old_j > j) {
6559 PyErr_SetString(PyExc_OverflowError,
6560 "new string is too long");
6561 return NULL;
6562 }
6563 old_j = j;
6564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 }
6566 else {
6567 j++;
6568 if (*p == '\n' || *p == '\r') {
6569 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006570 old_j = j = 0;
6571 if (i < 0) {
6572 PyErr_SetString(PyExc_OverflowError,
6573 "new string is too long");
6574 return NULL;
6575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 }
6577 }
6578
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006579 if ((i + j) < 0) {
6580 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6581 return NULL;
6582 }
6583
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 /* Second pass: create output string and fill it */
6585 u = _PyUnicode_New(i + j);
6586 if (!u)
6587 return NULL;
6588
6589 j = 0;
6590 q = u->str;
6591
6592 for (p = self->str; p < e; p++)
6593 if (*p == '\t') {
6594 if (tabsize > 0) {
6595 i = tabsize - (j % tabsize);
6596 j += i;
6597 while (i--)
6598 *q++ = ' ';
6599 }
6600 }
6601 else {
6602 j++;
6603 *q++ = *p;
6604 if (*p == '\n' || *p == '\r')
6605 j = 0;
6606 }
6607
6608 return (PyObject*) u;
6609}
6610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006611PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612"S.find(sub [,start [,end]]) -> int\n\
6613\n\
6614Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006615such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616arguments start and end are interpreted as in slice notation.\n\
6617\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006618Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
6620static PyObject *
6621unicode_find(PyUnicodeObject *self, PyObject *args)
6622{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006623 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006624 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006625 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006626 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627
Guido van Rossumb8872e62000-05-09 14:14:27 +00006628 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6629 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006631 substring = PyUnicode_FromObject(substring);
6632 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return NULL;
6634
Thomas Wouters477c8d52006-05-27 19:21:47 +00006635 result = stringlib_find_slice(
6636 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6637 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6638 start, end
6639 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640
6641 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006642
6643 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
6646static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006647unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
6649 if (index < 0 || index >= self->length) {
6650 PyErr_SetString(PyExc_IndexError, "string index out of range");
6651 return NULL;
6652 }
6653
6654 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6655}
6656
Guido van Rossumc2504932007-09-18 19:42:40 +00006657/* Believe it or not, this produces the same value for ASCII strings
6658 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006660unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661{
Guido van Rossumc2504932007-09-18 19:42:40 +00006662 Py_ssize_t len;
6663 Py_UNICODE *p;
6664 long x;
6665
6666 if (self->hash != -1)
6667 return self->hash;
6668 len = Py_Size(self);
6669 p = self->str;
6670 x = *p << 7;
6671 while (--len >= 0)
6672 x = (1000003*x) ^ *p++;
6673 x ^= Py_Size(self);
6674 if (x == -1)
6675 x = -2;
6676 self->hash = x;
6677 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678}
6679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006680PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681"S.index(sub [,start [,end]]) -> int\n\
6682\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006683Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
6685static PyObject *
6686unicode_index(PyUnicodeObject *self, PyObject *args)
6687{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006689 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006690 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006691 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
Guido van Rossumb8872e62000-05-09 14:14:27 +00006693 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6694 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006696 substring = PyUnicode_FromObject(substring);
6697 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 return NULL;
6699
Thomas Wouters477c8d52006-05-27 19:21:47 +00006700 result = stringlib_find_slice(
6701 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6702 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6703 start, end
6704 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705
6706 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006707
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 if (result < 0) {
6709 PyErr_SetString(PyExc_ValueError, "substring not found");
6710 return NULL;
6711 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006712
Martin v. Löwis18e16552006-02-15 17:27:45 +00006713 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006716PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006717"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006719Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721
6722static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006723unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
6725 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6726 register const Py_UNICODE *e;
6727 int cased;
6728
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 /* Shortcut for single character strings */
6730 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006731 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006733 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006734 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006735 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 e = p + PyUnicode_GET_SIZE(self);
6738 cased = 0;
6739 for (; p < e; p++) {
6740 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 else if (!cased && Py_UNICODE_ISLOWER(ch))
6745 cased = 1;
6746 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006747 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748}
6749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006751"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006753Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006754at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
6756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006757unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
6759 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6760 register const Py_UNICODE *e;
6761 int cased;
6762
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 /* Shortcut for single character strings */
6764 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006765 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006767 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006768 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006769 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006770
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 e = p + PyUnicode_GET_SIZE(self);
6772 cased = 0;
6773 for (; p < e; p++) {
6774 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 else if (!cased && Py_UNICODE_ISUPPER(ch))
6779 cased = 1;
6780 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006781 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782}
6783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006785"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006787Return True if S is a titlecased string and there is at least one\n\
6788character in S, i.e. upper- and titlecase characters may only\n\
6789follow uncased characters and lowercase characters only cased ones.\n\
6790Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
6792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006793unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794{
6795 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6796 register const Py_UNICODE *e;
6797 int cased, previous_is_cased;
6798
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 /* Shortcut for single character strings */
6800 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006801 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6802 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006804 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006805 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006807
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 e = p + PyUnicode_GET_SIZE(self);
6809 cased = 0;
6810 previous_is_cased = 0;
6811 for (; p < e; p++) {
6812 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6815 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006816 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 previous_is_cased = 1;
6818 cased = 1;
6819 }
6820 else if (Py_UNICODE_ISLOWER(ch)) {
6821 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 previous_is_cased = 1;
6824 cased = 1;
6825 }
6826 else
6827 previous_is_cased = 0;
6828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830}
6831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006832PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006833"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006835Return True if all characters in S are whitespace\n\
6836and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
6838static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006839unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840{
6841 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6842 register const Py_UNICODE *e;
6843
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 /* Shortcut for single character strings */
6845 if (PyUnicode_GET_SIZE(self) == 1 &&
6846 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006847 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006849 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006850 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006851 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006852
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 e = p + PyUnicode_GET_SIZE(self);
6854 for (; p < e; p++) {
6855 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006856 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859}
6860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006863\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006864Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006865and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006866
6867static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006868unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006869{
6870 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6871 register const Py_UNICODE *e;
6872
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873 /* Shortcut for single character strings */
6874 if (PyUnicode_GET_SIZE(self) == 1 &&
6875 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006876 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006877
6878 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006879 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006881
6882 e = p + PyUnicode_GET_SIZE(self);
6883 for (; p < e; p++) {
6884 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006885 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006886 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888}
6889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006892\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006893Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006894and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895
6896static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006897unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006898{
6899 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6900 register const Py_UNICODE *e;
6901
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006902 /* Shortcut for single character strings */
6903 if (PyUnicode_GET_SIZE(self) == 1 &&
6904 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006905 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006906
6907 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006908 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006910
6911 e = p + PyUnicode_GET_SIZE(self);
6912 for (; p < e; p++) {
6913 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006914 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006915 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006916 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006917}
6918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006919PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006922Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006923False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924
6925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006926unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927{
6928 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6929 register const Py_UNICODE *e;
6930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 /* Shortcut for single character strings */
6932 if (PyUnicode_GET_SIZE(self) == 1 &&
6933 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006934 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006936 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006937 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006939
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 e = p + PyUnicode_GET_SIZE(self);
6941 for (; p < e; p++) {
6942 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006943 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006945 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946}
6947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006948PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006951Return True if all characters in S are digits\n\
6952and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953
6954static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006955unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956{
6957 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6958 register const Py_UNICODE *e;
6959
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 /* Shortcut for single character strings */
6961 if (PyUnicode_GET_SIZE(self) == 1 &&
6962 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006965 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006966 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006968
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 e = p + PyUnicode_GET_SIZE(self);
6970 for (; p < e; p++) {
6971 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006972 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006974 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975}
6976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006977PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006978"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006980Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006981False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
6983static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006984unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985{
6986 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6987 register const Py_UNICODE *e;
6988
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 /* Shortcut for single character strings */
6990 if (PyUnicode_GET_SIZE(self) == 1 &&
6991 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006992 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006994 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006995 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006997
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 e = p + PyUnicode_GET_SIZE(self);
6999 for (; p < e; p++) {
7000 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007001 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007003 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004}
7005
Martin v. Löwis47383402007-08-15 07:32:56 +00007006int
7007PyUnicode_IsIdentifier(PyObject *self)
7008{
7009 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7010 register const Py_UNICODE *e;
7011
7012 /* Special case for empty strings */
7013 if (PyUnicode_GET_SIZE(self) == 0)
7014 return 0;
7015
7016 /* PEP 3131 says that the first character must be in
7017 XID_Start and subsequent characters in XID_Continue,
7018 and for the ASCII range, the 2.x rules apply (i.e
7019 start with letters and underscore, continue with
7020 letters, digits, underscore). However, given the current
7021 definition of XID_Start and XID_Continue, it is sufficient
7022 to check just for these, except that _ must be allowed
7023 as starting an identifier. */
7024 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7025 return 0;
7026
7027 e = p + PyUnicode_GET_SIZE(self);
7028 for (p++; p < e; p++) {
7029 if (!_PyUnicode_IsXidContinue(*p))
7030 return 0;
7031 }
7032 return 1;
7033}
7034
7035PyDoc_STRVAR(isidentifier__doc__,
7036"S.isidentifier() -> bool\n\
7037\n\
7038Return True if S is a valid identifier according\n\
7039to the language definition.");
7040
7041static PyObject*
7042unicode_isidentifier(PyObject *self)
7043{
7044 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7045}
7046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007047PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048"S.join(sequence) -> unicode\n\
7049\n\
7050Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007051sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052
7053static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007054unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007056 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057}
7058
Martin v. Löwis18e16552006-02-15 17:27:45 +00007059static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060unicode_length(PyUnicodeObject *self)
7061{
7062 return self->length;
7063}
7064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007065PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007066"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067\n\
7068Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007069done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070
7071static PyObject *
7072unicode_ljust(PyUnicodeObject *self, PyObject *args)
7073{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007074 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007075 Py_UNICODE fillchar = ' ';
7076
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007077 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 return NULL;
7079
Tim Peters7a29bd52001-09-12 03:03:31 +00007080 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 Py_INCREF(self);
7082 return (PyObject*) self;
7083 }
7084
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007085 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086}
7087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007088PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089"S.lower() -> unicode\n\
7090\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007091Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
7093static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007094unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 return fixup(self, fixlower);
7097}
7098
/* Strip modes.  The values double as indices into stripformat[]
   below, so the two must be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: one PyArg_ParseTuple format per strip
   mode, each taking a single optional object argument. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the three-character "|O:"
   prefix of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7107
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108/* externally visible for str.strip(unicode) */
7109PyObject *
7110_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7111{
7112 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007113 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007115 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7116 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117
Thomas Wouters477c8d52006-05-27 19:21:47 +00007118 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7119
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120 i = 0;
7121 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007122 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7123 i++;
7124 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125 }
7126
7127 j = len;
7128 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007129 do {
7130 j--;
7131 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7132 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007133 }
7134
7135 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007136 Py_INCREF(self);
7137 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007138 }
7139 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007140 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007141}
7142
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
7144static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007145do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007147 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007148 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007149
7150 i = 0;
7151 if (striptype != RIGHTSTRIP) {
7152 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7153 i++;
7154 }
7155 }
7156
7157 j = len;
7158 if (striptype != LEFTSTRIP) {
7159 do {
7160 j--;
7161 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7162 j++;
7163 }
7164
7165 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7166 Py_INCREF(self);
7167 return (PyObject*)self;
7168 }
7169 else
7170 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171}
7172
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007173
7174static PyObject *
7175do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7176{
7177 PyObject *sep = NULL;
7178
7179 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7180 return NULL;
7181
7182 if (sep != NULL && sep != Py_None) {
7183 if (PyUnicode_Check(sep))
7184 return _PyUnicode_XStrip(self, striptype, sep);
7185 else if (PyString_Check(sep)) {
7186 PyObject *res;
7187 sep = PyUnicode_FromObject(sep);
7188 if (sep==NULL)
7189 return NULL;
7190 res = _PyUnicode_XStrip(self, striptype, sep);
7191 Py_DECREF(sep);
7192 return res;
7193 }
7194 else {
7195 PyErr_Format(PyExc_TypeError,
7196 "%s arg must be None, unicode or str",
7197 STRIPNAME(striptype));
7198 return NULL;
7199 }
7200 }
7201
7202 return do_strip(self, striptype);
7203}
7204
7205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007206PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007207"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007208\n\
7209Return a copy of the string S with leading and trailing\n\
7210whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007211If chars is given and not None, remove characters in chars instead.\n\
7212If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007213
7214static PyObject *
7215unicode_strip(PyUnicodeObject *self, PyObject *args)
7216{
7217 if (PyTuple_GET_SIZE(args) == 0)
7218 return do_strip(self, BOTHSTRIP); /* Common case */
7219 else
7220 return do_argstrip(self, BOTHSTRIP, args);
7221}
7222
7223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007224PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007225"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007226\n\
7227Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007228If chars is given and not None, remove characters in chars instead.\n\
7229If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007230
7231static PyObject *
7232unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7233{
7234 if (PyTuple_GET_SIZE(args) == 0)
7235 return do_strip(self, LEFTSTRIP); /* Common case */
7236 else
7237 return do_argstrip(self, LEFTSTRIP, args);
7238}
7239
7240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007241PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007242"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007243\n\
7244Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007245If chars is given and not None, remove characters in chars instead.\n\
7246If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007247
7248static PyObject *
7249unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7250{
7251 if (PyTuple_GET_SIZE(args) == 0)
7252 return do_strip(self, RIGHTSTRIP); /* Common case */
7253 else
7254 return do_argstrip(self, RIGHTSTRIP, args);
7255}
7256
7257
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007259unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260{
7261 PyUnicodeObject *u;
7262 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007263 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007264 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
7266 if (len < 0)
7267 len = 0;
7268
Tim Peters7a29bd52001-09-12 03:03:31 +00007269 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 /* no repeat, return original string */
7271 Py_INCREF(str);
7272 return (PyObject*) str;
7273 }
Tim Peters8f422462000-09-09 06:13:41 +00007274
7275 /* ensure # of chars needed doesn't overflow int and # of bytes
7276 * needed doesn't overflow size_t
7277 */
7278 nchars = len * str->length;
7279 if (len && nchars / len != str->length) {
7280 PyErr_SetString(PyExc_OverflowError,
7281 "repeated string is too long");
7282 return NULL;
7283 }
7284 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7285 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7286 PyErr_SetString(PyExc_OverflowError,
7287 "repeated string is too long");
7288 return NULL;
7289 }
7290 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291 if (!u)
7292 return NULL;
7293
7294 p = u->str;
7295
Thomas Wouters477c8d52006-05-27 19:21:47 +00007296 if (str->length == 1 && len > 0) {
7297 Py_UNICODE_FILL(p, str->str[0], len);
7298 } else {
7299 Py_ssize_t done = 0; /* number of characters copied this far */
7300 if (done < nchars) {
7301 Py_UNICODE_COPY(p, str->str, str->length);
7302 done = str->length;
7303 }
7304 while (done < nchars) {
7305 int n = (done <= nchars-done) ? done : nchars-done;
7306 Py_UNICODE_COPY(p+done, p, n);
7307 done += n;
7308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 }
7310
7311 return (PyObject*) u;
7312}
7313
7314PyObject *PyUnicode_Replace(PyObject *obj,
7315 PyObject *subobj,
7316 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007317 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318{
7319 PyObject *self;
7320 PyObject *str1;
7321 PyObject *str2;
7322 PyObject *result;
7323
7324 self = PyUnicode_FromObject(obj);
7325 if (self == NULL)
7326 return NULL;
7327 str1 = PyUnicode_FromObject(subobj);
7328 if (str1 == NULL) {
7329 Py_DECREF(self);
7330 return NULL;
7331 }
7332 str2 = PyUnicode_FromObject(replobj);
7333 if (str2 == NULL) {
7334 Py_DECREF(self);
7335 Py_DECREF(str1);
7336 return NULL;
7337 }
Tim Petersced69f82003-09-16 20:30:58 +00007338 result = replace((PyUnicodeObject *)self,
7339 (PyUnicodeObject *)str1,
7340 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 maxcount);
7342 Py_DECREF(self);
7343 Py_DECREF(str1);
7344 Py_DECREF(str2);
7345 return result;
7346}
7347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007348PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349"S.replace (old, new[, maxsplit]) -> unicode\n\
7350\n\
7351Return a copy of S with all occurrences of substring\n\
7352old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007353given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354
7355static PyObject*
7356unicode_replace(PyUnicodeObject *self, PyObject *args)
7357{
7358 PyUnicodeObject *str1;
7359 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007360 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 PyObject *result;
7362
Martin v. Löwis18e16552006-02-15 17:27:45 +00007363 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 return NULL;
7365 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7366 if (str1 == NULL)
7367 return NULL;
7368 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007369 if (str2 == NULL) {
7370 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
7374 result = replace(self, str1, str2, maxcount);
7375
7376 Py_DECREF(str1);
7377 Py_DECREF(str2);
7378 return result;
7379}
7380
7381static
7382PyObject *unicode_repr(PyObject *unicode)
7383{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007384 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007385 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007386 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7387 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7388
7389 /* XXX(nnorwitz): rather than over-allocating, it would be
7390 better to choose a different scheme. Perhaps scan the
7391 first N-chars of the string and allocate based on that size.
7392 */
7393 /* Initial allocation is based on the longest-possible unichr
7394 escape.
7395
7396 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7397 unichr, so in this case it's the longest unichr escape. In
7398 narrow (UTF-16) builds this is five chars per source unichr
7399 since there are two unichrs in the surrogate pair, so in narrow
7400 (UTF-16) builds it's not the longest unichr escape.
7401
7402 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7403 so in the narrow (UTF-16) build case it's the longest unichr
7404 escape.
7405 */
7406
Walter Dörwald1ab83302007-05-18 17:15:44 +00007407 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007408 2 /* quotes */
7409#ifdef Py_UNICODE_WIDE
7410 + 10*size
7411#else
7412 + 6*size
7413#endif
7414 + 1);
7415 if (repr == NULL)
7416 return NULL;
7417
Walter Dörwald1ab83302007-05-18 17:15:44 +00007418 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007419
7420 /* Add quote */
7421 *p++ = (findchar(s, size, '\'') &&
7422 !findchar(s, size, '"')) ? '"' : '\'';
7423 while (size-- > 0) {
7424 Py_UNICODE ch = *s++;
7425
7426 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007427 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007428 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007429 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007430 continue;
7431 }
7432
7433#ifdef Py_UNICODE_WIDE
7434 /* Map 21-bit characters to '\U00xxxxxx' */
7435 else if (ch >= 0x10000) {
7436 *p++ = '\\';
7437 *p++ = 'U';
7438 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7439 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7440 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7441 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7442 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7443 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7444 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7445 *p++ = hexdigits[ch & 0x0000000F];
7446 continue;
7447 }
7448#else
7449 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7450 else if (ch >= 0xD800 && ch < 0xDC00) {
7451 Py_UNICODE ch2;
7452 Py_UCS4 ucs;
7453
7454 ch2 = *s++;
7455 size--;
7456 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7457 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7458 *p++ = '\\';
7459 *p++ = 'U';
7460 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7461 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7462 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7463 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7464 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7465 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7466 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7467 *p++ = hexdigits[ucs & 0x0000000F];
7468 continue;
7469 }
7470 /* Fall through: isolated surrogates are copied as-is */
7471 s--;
7472 size++;
7473 }
7474#endif
7475
7476 /* Map 16-bit characters to '\uxxxx' */
7477 if (ch >= 256) {
7478 *p++ = '\\';
7479 *p++ = 'u';
7480 *p++ = hexdigits[(ch >> 12) & 0x000F];
7481 *p++ = hexdigits[(ch >> 8) & 0x000F];
7482 *p++ = hexdigits[(ch >> 4) & 0x000F];
7483 *p++ = hexdigits[ch & 0x000F];
7484 }
7485
7486 /* Map special whitespace to '\t', \n', '\r' */
7487 else if (ch == '\t') {
7488 *p++ = '\\';
7489 *p++ = 't';
7490 }
7491 else if (ch == '\n') {
7492 *p++ = '\\';
7493 *p++ = 'n';
7494 }
7495 else if (ch == '\r') {
7496 *p++ = '\\';
7497 *p++ = 'r';
7498 }
7499
7500 /* Map non-printable US ASCII to '\xhh' */
7501 else if (ch < ' ' || ch >= 0x7F) {
7502 *p++ = '\\';
7503 *p++ = 'x';
7504 *p++ = hexdigits[(ch >> 4) & 0x000F];
7505 *p++ = hexdigits[ch & 0x000F];
7506 }
7507
7508 /* Copy everything else as-is */
7509 else
7510 *p++ = (char) ch;
7511 }
7512 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007513 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007514
7515 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007516 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007517 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518}
7519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007520PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521"S.rfind(sub [,start [,end]]) -> int\n\
7522\n\
7523Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007524such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525arguments start and end are interpreted as in slice notation.\n\
7526\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007527Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
7529static PyObject *
7530unicode_rfind(PyUnicodeObject *self, PyObject *args)
7531{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007532 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007533 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007534 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007535 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536
Guido van Rossumb8872e62000-05-09 14:14:27 +00007537 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7538 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007540 substring = PyUnicode_FromObject(substring);
7541 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 return NULL;
7543
Thomas Wouters477c8d52006-05-27 19:21:47 +00007544 result = stringlib_rfind_slice(
7545 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7546 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7547 start, end
7548 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549
7550 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007551
7552 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553}
7554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007555PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556"S.rindex(sub [,start [,end]]) -> int\n\
7557\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007558Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559
7560static PyObject *
7561unicode_rindex(PyUnicodeObject *self, PyObject *args)
7562{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007563 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007564 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007565 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007566 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
Guido van Rossumb8872e62000-05-09 14:14:27 +00007568 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7569 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007571 substring = PyUnicode_FromObject(substring);
7572 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573 return NULL;
7574
Thomas Wouters477c8d52006-05-27 19:21:47 +00007575 result = stringlib_rfind_slice(
7576 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7577 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7578 start, end
7579 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
7581 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007582
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 if (result < 0) {
7584 PyErr_SetString(PyExc_ValueError, "substring not found");
7585 return NULL;
7586 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007587 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588}
7589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007590PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007591"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592\n\
7593Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007594done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595
7596static PyObject *
7597unicode_rjust(PyUnicodeObject *self, PyObject *args)
7598{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007599 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007600 Py_UNICODE fillchar = ' ';
7601
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007602 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603 return NULL;
7604
Tim Peters7a29bd52001-09-12 03:03:31 +00007605 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 Py_INCREF(self);
7607 return (PyObject*) self;
7608 }
7609
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007610 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611}
7612
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613PyObject *PyUnicode_Split(PyObject *s,
7614 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616{
7617 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007618
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619 s = PyUnicode_FromObject(s);
7620 if (s == NULL)
7621 return NULL;
7622 if (sep != NULL) {
7623 sep = PyUnicode_FromObject(sep);
7624 if (sep == NULL) {
7625 Py_DECREF(s);
7626 return NULL;
7627 }
7628 }
7629
7630 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7631
7632 Py_DECREF(s);
7633 Py_XDECREF(sep);
7634 return result;
7635}
7636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007637PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638"S.split([sep [,maxsplit]]) -> list of strings\n\
7639\n\
7640Return a list of the words in S, using sep as the\n\
7641delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007642splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007643any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644
7645static PyObject*
7646unicode_split(PyUnicodeObject *self, PyObject *args)
7647{
7648 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007649 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
Martin v. Löwis18e16552006-02-15 17:27:45 +00007651 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 return NULL;
7653
7654 if (substring == Py_None)
7655 return split(self, NULL, maxcount);
7656 else if (PyUnicode_Check(substring))
7657 return split(self, (PyUnicodeObject *)substring, maxcount);
7658 else
7659 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7660}
7661
Thomas Wouters477c8d52006-05-27 19:21:47 +00007662PyObject *
7663PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7664{
7665 PyObject* str_obj;
7666 PyObject* sep_obj;
7667 PyObject* out;
7668
7669 str_obj = PyUnicode_FromObject(str_in);
7670 if (!str_obj)
7671 return NULL;
7672 sep_obj = PyUnicode_FromObject(sep_in);
7673 if (!sep_obj) {
7674 Py_DECREF(str_obj);
7675 return NULL;
7676 }
7677
7678 out = stringlib_partition(
7679 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7680 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7681 );
7682
7683 Py_DECREF(sep_obj);
7684 Py_DECREF(str_obj);
7685
7686 return out;
7687}
7688
7689
7690PyObject *
7691PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7692{
7693 PyObject* str_obj;
7694 PyObject* sep_obj;
7695 PyObject* out;
7696
7697 str_obj = PyUnicode_FromObject(str_in);
7698 if (!str_obj)
7699 return NULL;
7700 sep_obj = PyUnicode_FromObject(sep_in);
7701 if (!sep_obj) {
7702 Py_DECREF(str_obj);
7703 return NULL;
7704 }
7705
7706 out = stringlib_rpartition(
7707 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7708 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7709 );
7710
7711 Py_DECREF(sep_obj);
7712 Py_DECREF(str_obj);
7713
7714 return out;
7715}
7716
7717PyDoc_STRVAR(partition__doc__,
7718"S.partition(sep) -> (head, sep, tail)\n\
7719\n\
7720Searches for the separator sep in S, and returns the part before it,\n\
7721the separator itself, and the part after it. If the separator is not\n\
7722found, returns S and two empty strings.");
7723
7724static PyObject*
7725unicode_partition(PyUnicodeObject *self, PyObject *separator)
7726{
7727 return PyUnicode_Partition((PyObject *)self, separator);
7728}
7729
7730PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007731"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007732\n\
7733Searches for the separator sep in S, starting at the end of S, and returns\n\
7734the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007735separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007736
7737static PyObject*
7738unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7739{
7740 return PyUnicode_RPartition((PyObject *)self, separator);
7741}
7742
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007743PyObject *PyUnicode_RSplit(PyObject *s,
7744 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007745 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007746{
7747 PyObject *result;
7748
7749 s = PyUnicode_FromObject(s);
7750 if (s == NULL)
7751 return NULL;
7752 if (sep != NULL) {
7753 sep = PyUnicode_FromObject(sep);
7754 if (sep == NULL) {
7755 Py_DECREF(s);
7756 return NULL;
7757 }
7758 }
7759
7760 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7761
7762 Py_DECREF(s);
7763 Py_XDECREF(sep);
7764 return result;
7765}
7766
7767PyDoc_STRVAR(rsplit__doc__,
7768"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7769\n\
7770Return a list of the words in S, using sep as the\n\
7771delimiter string, starting at the end of the string and\n\
7772working to the front. If maxsplit is given, at most maxsplit\n\
7773splits are done. If sep is not specified, any whitespace string\n\
7774is a separator.");
7775
7776static PyObject*
7777unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7778{
7779 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007780 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007781
Martin v. Löwis18e16552006-02-15 17:27:45 +00007782 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007783 return NULL;
7784
7785 if (substring == Py_None)
7786 return rsplit(self, NULL, maxcount);
7787 else if (PyUnicode_Check(substring))
7788 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7789 else
7790 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7791}
7792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007793PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007794"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795\n\
7796Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007797Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007798is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799
7800static PyObject*
7801unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7802{
Guido van Rossum86662912000-04-11 15:38:46 +00007803 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
Guido van Rossum86662912000-04-11 15:38:46 +00007805 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 return NULL;
7807
Guido van Rossum86662912000-04-11 15:38:46 +00007808 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809}
7810
7811static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007812PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813{
Walter Dörwald346737f2007-05-31 10:44:43 +00007814 if (PyUnicode_CheckExact(self)) {
7815 Py_INCREF(self);
7816 return self;
7817 } else
7818 /* Subtype -- return genuine unicode string with the same value. */
7819 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7820 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821}
7822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007823PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824"S.swapcase() -> unicode\n\
7825\n\
7826Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007827and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828
7829static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007830unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 return fixup(self, fixswapcase);
7833}
7834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007835PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836"S.translate(table) -> unicode\n\
7837\n\
7838Return a copy of the string S, where all characters have been mapped\n\
7839through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007840Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7841Unmapped characters are left untouched. Characters mapped to None\n\
7842are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843
7844static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007845unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846{
Georg Brandl94c2c752007-10-23 06:52:59 +00007847 PyObject *newtable = NULL;
7848 Py_ssize_t i = 0;
7849 PyObject *key, *value, *result;
7850
7851 if (!PyDict_Check(table)) {
7852 PyErr_SetString(PyExc_TypeError, "translate argument must be a dict");
7853 return NULL;
7854 }
7855 /* fixup the table -- allow size-1 string keys instead of only int keys */
7856 newtable = PyDict_Copy(table);
7857 if (!newtable) return NULL;
7858 while (PyDict_Next(table, &i, &key, &value)) {
7859 if (PyUnicode_Check(key)) {
7860 /* convert string keys to integer keys */
7861 PyObject *newkey;
7862 int res;
7863 if (PyUnicode_GET_SIZE(key) != 1) {
7864 PyErr_SetString(PyExc_ValueError, "string items in translate "
7865 "table must be 1 element long");
7866 goto err;
7867 }
7868 newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]);
7869 if (!newkey)
7870 goto err;
7871 res = PyDict_SetItem(newtable, newkey, value);
7872 Py_DECREF(newkey);
7873 if (res < 0)
7874 goto err;
7875 } else if (PyInt_Check(key)) {
7876 /* just keep integer keys */
7877 if (PyDict_SetItem(newtable, key, value) < 0)
7878 goto err;
7879 } else {
7880 PyErr_SetString(PyExc_TypeError, "items in translate table must be "
7881 "strings or integers");
7882 goto err;
7883 }
7884 }
7885
7886 result = PyUnicode_TranslateCharmap(self->str,
7887 self->length,
7888 newtable,
7889 "ignore");
7890 Py_DECREF(newtable);
7891 return result;
7892 err:
7893 Py_DECREF(newtable);
7894 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895}
7896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007897PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898"S.upper() -> unicode\n\
7899\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007900Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901
7902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007903unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 return fixup(self, fixupper);
7906}
7907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007908PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909"S.zfill(width) -> unicode\n\
7910\n\
7911Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007912of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913
7914static PyObject *
7915unicode_zfill(PyUnicodeObject *self, PyObject *args)
7916{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007917 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 PyUnicodeObject *u;
7919
Martin v. Löwis18e16552006-02-15 17:27:45 +00007920 Py_ssize_t width;
7921 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922 return NULL;
7923
7924 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007925 if (PyUnicode_CheckExact(self)) {
7926 Py_INCREF(self);
7927 return (PyObject*) self;
7928 }
7929 else
7930 return PyUnicode_FromUnicode(
7931 PyUnicode_AS_UNICODE(self),
7932 PyUnicode_GET_SIZE(self)
7933 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934 }
7935
7936 fill = width - self->length;
7937
7938 u = pad(self, fill, 0, '0');
7939
Walter Dörwald068325e2002-04-15 13:36:47 +00007940 if (u == NULL)
7941 return NULL;
7942
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943 if (u->str[fill] == '+' || u->str[fill] == '-') {
7944 /* move sign to beginning of string */
7945 u->str[0] = u->str[fill];
7946 u->str[fill] = '0';
7947 }
7948
7949 return (PyObject*) u;
7950}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951
#if 0
/* Compiled out.  Returns unicode_freelist_size wrapped by
   PyInt_FromLong(); the matching method-table entry is described
   there as being for debugging the implementation only. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size_obj = PyInt_FromLong(unicode_freelist_size);

    return size_obj;
}
#endif
7959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007960PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007961"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007963Return True if S starts with the specified prefix, False otherwise.\n\
7964With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007965With optional end, stop comparing S at that position.\n\
7966prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967
7968static PyObject *
7969unicode_startswith(PyUnicodeObject *self,
7970 PyObject *args)
7971{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007972 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007974 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007975 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007976 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007978 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007979 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007981 if (PyTuple_Check(subobj)) {
7982 Py_ssize_t i;
7983 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7984 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7985 PyTuple_GET_ITEM(subobj, i));
7986 if (substring == NULL)
7987 return NULL;
7988 result = tailmatch(self, substring, start, end, -1);
7989 Py_DECREF(substring);
7990 if (result) {
7991 Py_RETURN_TRUE;
7992 }
7993 }
7994 /* nothing matched */
7995 Py_RETURN_FALSE;
7996 }
7997 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007999 return NULL;
8000 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008002 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003}
8004
8005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008006PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008007"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008009Return True if S ends with the specified suffix, False otherwise.\n\
8010With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008011With optional end, stop comparing S at that position.\n\
8012suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013
8014static PyObject *
8015unicode_endswith(PyUnicodeObject *self,
8016 PyObject *args)
8017{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008018 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008020 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008021 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008022 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008024 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8025 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008027 if (PyTuple_Check(subobj)) {
8028 Py_ssize_t i;
8029 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8030 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8031 PyTuple_GET_ITEM(subobj, i));
8032 if (substring == NULL)
8033 return NULL;
8034 result = tailmatch(self, substring, start, end, +1);
8035 Py_DECREF(substring);
8036 if (result) {
8037 Py_RETURN_TRUE;
8038 }
8039 }
8040 Py_RETURN_FALSE;
8041 }
8042 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008046 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008048 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049}
8050
Eric Smith8c663262007-08-25 02:26:07 +00008051#include "stringlib/string_format.h"
8052
8053PyDoc_STRVAR(format__doc__,
8054"S.format(*args, **kwargs) -> unicode\n\
8055\n\
8056");
8057
Eric Smith8c663262007-08-25 02:26:07 +00008058PyDoc_STRVAR(p_format__doc__,
8059"S.__format__(format_spec) -> unicode\n\
8060\n\
8061");
8062
8063static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008064unicode_getnewargs(PyUnicodeObject *v)
8065{
8066 return Py_BuildValue("(u#)", v->str, v->length);
8067}
8068
8069
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070static PyMethodDef unicode_methods[] = {
8071
8072 /* Order is according to common usage: often used methods should
8073 appear first, since lookup is done sequentially. */
8074
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008075 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8076 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8077 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008078 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008079 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8080 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8081 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8082 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8083 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8084 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8085 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008086 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008087 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8088 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8089 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008090 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008091 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8092 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8093 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008094 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008095 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008096 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008097 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008098 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8099 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8100 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8101 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8102 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8103 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8104 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8105 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8106 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8107 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8108 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8109 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8110 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8111 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008112 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008113 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008114 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8115 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008116 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8117 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008118#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008119 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120#endif
8121
8122#if 0
8123 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008124 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125#endif
8126
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008127 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 {NULL, NULL}
8129};
8130
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008131static PyObject *
8132unicode_mod(PyObject *v, PyObject *w)
8133{
8134 if (!PyUnicode_Check(v)) {
8135 Py_INCREF(Py_NotImplemented);
8136 return Py_NotImplemented;
8137 }
8138 return PyUnicode_Format(v, w);
8139}
8140
8141static PyNumberMethods unicode_as_number = {
8142 0, /*nb_add*/
8143 0, /*nb_subtract*/
8144 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008145 unicode_mod, /*nb_remainder*/
8146};
8147
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008149 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008150 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008151 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8152 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008153 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 0, /* sq_ass_item */
8155 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008156 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157};
8158
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008159static PyObject*
8160unicode_subscript(PyUnicodeObject* self, PyObject* item)
8161{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008162 if (PyIndex_Check(item)) {
8163 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008164 if (i == -1 && PyErr_Occurred())
8165 return NULL;
8166 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008167 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008168 return unicode_getitem(self, i);
8169 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008170 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008171 Py_UNICODE* source_buf;
8172 Py_UNICODE* result_buf;
8173 PyObject* result;
8174
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008175 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008176 &start, &stop, &step, &slicelength) < 0) {
8177 return NULL;
8178 }
8179
8180 if (slicelength <= 0) {
8181 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008182 } else if (start == 0 && step == 1 && slicelength == self->length &&
8183 PyUnicode_CheckExact(self)) {
8184 Py_INCREF(self);
8185 return (PyObject *)self;
8186 } else if (step == 1) {
8187 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008188 } else {
8189 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008190 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8191 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008192
8193 if (result_buf == NULL)
8194 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008195
8196 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8197 result_buf[i] = source_buf[cur];
8198 }
Tim Petersced69f82003-09-16 20:30:58 +00008199
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008200 result = PyUnicode_FromUnicode(result_buf, slicelength);
8201 PyMem_FREE(result_buf);
8202 return result;
8203 }
8204 } else {
8205 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8206 return NULL;
8207 }
8208}
8209
8210static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008211 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008212 (binaryfunc)unicode_subscript, /* mp_subscript */
8213 (objobjargproc)0, /* mp_ass_subscript */
8214};
8215
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217/* Helpers for PyUnicode_Format() */
8218
8219static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008220getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008222 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 if (argidx < arglen) {
8224 (*p_argidx)++;
8225 if (arglen < 0)
8226 return args;
8227 else
8228 return PyTuple_GetItem(args, argidx);
8229 }
8230 PyErr_SetString(PyExc_TypeError,
8231 "not enough arguments for format string");
8232 return NULL;
8233}
8234
/* Single-bit flags that can be OR-ed together in the `flags` argument
   of the format helpers.  formatfloat() and formatint() test F_ALT to
   decide whether "#" goes into the generated C format string. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8240
Martin v. Löwis18e16552006-02-15 17:27:45 +00008241static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008242strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008244 register Py_ssize_t i;
8245 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 for (i = len - 1; i >= 0; i--)
8247 buffer[i] = (Py_UNICODE) charbuffer[i];
8248
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 return len;
8250}
8251
Neal Norwitzfc76d632006-01-10 06:03:13 +00008252static int
8253doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8254{
Tim Peters15231542006-02-16 01:08:01 +00008255 Py_ssize_t result;
8256
Neal Norwitzfc76d632006-01-10 06:03:13 +00008257 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008258 result = strtounicode(buffer, (char *)buffer);
8259 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008260}
8261
8262static int
8263longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8264{
Tim Peters15231542006-02-16 01:08:01 +00008265 Py_ssize_t result;
8266
Neal Norwitzfc76d632006-01-10 06:03:13 +00008267 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008268 result = strtounicode(buffer, (char *)buffer);
8269 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008270}
8271
Guido van Rossum078151d2002-08-11 04:24:12 +00008272/* XXX To save some code duplication, formatfloat/long/int could have been
8273 shared with stringobject.c, converting from 8-bit to Unicode after the
8274 formatting is done. */
8275
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276static int
8277formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008278 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 int flags,
8280 int prec,
8281 int type,
8282 PyObject *v)
8283{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008284 /* fmt = '%#.' + `prec` + `type`
8285 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 char fmt[20];
8287 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008288
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 x = PyFloat_AsDouble(v);
8290 if (x == -1.0 && PyErr_Occurred())
8291 return -1;
8292 if (prec < 0)
8293 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8295 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008296 /* Worst case length calc to ensure no buffer overrun:
8297
8298 'g' formats:
8299 fmt = %#.<prec>g
8300 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8301 for any double rep.)
8302 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8303
8304 'f' formats:
8305 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8306 len = 1 + 50 + 1 + prec = 52 + prec
8307
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008308 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008309 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008310
8311 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008312 if (((type == 'g' || type == 'G') &&
8313 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008314 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008315 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008316 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008317 return -1;
8318 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008319 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8320 (flags&F_ALT) ? "#" : "",
8321 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008322 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323}
8324
Tim Peters38fd5b62000-09-21 05:43:11 +00008325static PyObject*
8326formatlong(PyObject *val, int flags, int prec, int type)
8327{
8328 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008329 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008330 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008331 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008332
8333 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8334 if (!str)
8335 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008336 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008337 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008338 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008339}
8340
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341static int
8342formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008343 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 int flags,
8345 int prec,
8346 int type,
8347 PyObject *v)
8348{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008349 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008350 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8351 * + 1 + 1
8352 * = 24
8353 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008354 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008355 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 long x;
8357
8358 x = PyInt_AsLong(v);
8359 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008360 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008361 if (x < 0 && type == 'u') {
8362 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008363 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008364 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8365 sign = "-";
8366 else
8367 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008369 prec = 1;
8370
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008371 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8372 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008373 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008374 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008375 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008376 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008377 return -1;
8378 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008379
8380 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008381 (type == 'x' || type == 'X' || type == 'o')) {
8382 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008383 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008384 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008385 * - when 0 is being converted, the C standard leaves off
8386 * the '0x' or '0X', which is inconsistent with other
8387 * %#x/%#X conversions and inconsistent with Python's
8388 * hex() function
8389 * - there are platforms that violate the standard and
8390 * convert 0 with the '0x' or '0X'
8391 * (Metrowerks, Compaq Tru64)
8392 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008393 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008394 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008395 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008396 * We can achieve the desired consistency by inserting our
8397 * own '0x' or '0X' prefix, and substituting %x/%X in place
8398 * of %#x/%#X.
8399 *
8400 * Note that this is the same approach as used in
8401 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008402 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008403 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8404 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008405 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008406 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008407 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8408 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008409 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008410 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008411 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008412 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008413 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008414 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415}
8416
8417static int
8418formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008419 size_t buflen,
8420 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008422 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008423 if (PyUnicode_Check(v)) {
8424 if (PyUnicode_GET_SIZE(v) != 1)
8425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008429 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008430 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008431 goto onError;
8432 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434
8435 else {
8436 /* Integer input truncated to a character */
8437 long x;
8438 x = PyInt_AsLong(v);
8439 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008440 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008441#ifdef Py_UNICODE_WIDE
8442 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008443 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008444 "%c arg not in range(0x110000) "
8445 "(wide Python build)");
8446 return -1;
8447 }
8448#else
8449 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008450 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008451 "%c arg not in range(0x10000) "
8452 "(narrow Python build)");
8453 return -1;
8454 }
8455#endif
8456 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 }
8458 buf[1] = '\0';
8459 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008460
8461 onError:
8462 PyErr_SetString(PyExc_TypeError,
8463 "%c requires int or char");
8464 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465}
8466
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in characters, of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is parenthesized so that the macro behaves as a single
   operand in every expression context (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
8476
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477PyObject *PyUnicode_Format(PyObject *format,
8478 PyObject *args)
8479{
8480 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008481 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 int args_owned = 0;
8483 PyUnicodeObject *result = NULL;
8484 PyObject *dict = NULL;
8485 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008486
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 if (format == NULL || args == NULL) {
8488 PyErr_BadInternalCall();
8489 return NULL;
8490 }
8491 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008492 if (uformat == NULL)
8493 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494 fmt = PyUnicode_AS_UNICODE(uformat);
8495 fmtcnt = PyUnicode_GET_SIZE(uformat);
8496
8497 reslen = rescnt = fmtcnt + 100;
8498 result = _PyUnicode_New(reslen);
8499 if (result == NULL)
8500 goto onError;
8501 res = PyUnicode_AS_UNICODE(result);
8502
8503 if (PyTuple_Check(args)) {
8504 arglen = PyTuple_Size(args);
8505 argidx = 0;
8506 }
8507 else {
8508 arglen = -1;
8509 argidx = -2;
8510 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008511 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008512 !PyString_Check(args) && !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 dict = args;
8514
8515 while (--fmtcnt >= 0) {
8516 if (*fmt != '%') {
8517 if (--rescnt < 0) {
8518 rescnt = fmtcnt + 100;
8519 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008520 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008521 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8523 --rescnt;
8524 }
8525 *res++ = *fmt++;
8526 }
8527 else {
8528 /* Got a format specifier */
8529 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008530 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532 Py_UNICODE c = '\0';
8533 Py_UNICODE fill;
8534 PyObject *v = NULL;
8535 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008536 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008539 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540
8541 fmt++;
8542 if (*fmt == '(') {
8543 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008544 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545 PyObject *key;
8546 int pcount = 1;
8547
8548 if (dict == NULL) {
8549 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008550 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 goto onError;
8552 }
8553 ++fmt;
8554 --fmtcnt;
8555 keystart = fmt;
8556 /* Skip over balanced parentheses */
8557 while (pcount > 0 && --fmtcnt >= 0) {
8558 if (*fmt == ')')
8559 --pcount;
8560 else if (*fmt == '(')
8561 ++pcount;
8562 fmt++;
8563 }
8564 keylen = fmt - keystart - 1;
8565 if (fmtcnt < 0 || pcount > 0) {
8566 PyErr_SetString(PyExc_ValueError,
8567 "incomplete format key");
8568 goto onError;
8569 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008570#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008571 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 then looked up since Python uses strings to hold
8573 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008574 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 key = PyUnicode_EncodeUTF8(keystart,
8576 keylen,
8577 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008578#else
8579 key = PyUnicode_FromUnicode(keystart, keylen);
8580#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 if (key == NULL)
8582 goto onError;
8583 if (args_owned) {
8584 Py_DECREF(args);
8585 args_owned = 0;
8586 }
8587 args = PyObject_GetItem(dict, key);
8588 Py_DECREF(key);
8589 if (args == NULL) {
8590 goto onError;
8591 }
8592 args_owned = 1;
8593 arglen = -1;
8594 argidx = -2;
8595 }
8596 while (--fmtcnt >= 0) {
8597 switch (c = *fmt++) {
8598 case '-': flags |= F_LJUST; continue;
8599 case '+': flags |= F_SIGN; continue;
8600 case ' ': flags |= F_BLANK; continue;
8601 case '#': flags |= F_ALT; continue;
8602 case '0': flags |= F_ZERO; continue;
8603 }
8604 break;
8605 }
8606 if (c == '*') {
8607 v = getnextarg(args, arglen, &argidx);
8608 if (v == NULL)
8609 goto onError;
8610 if (!PyInt_Check(v)) {
8611 PyErr_SetString(PyExc_TypeError,
8612 "* wants int");
8613 goto onError;
8614 }
8615 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008616 if (width == -1 && PyErr_Occurred())
8617 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 if (width < 0) {
8619 flags |= F_LJUST;
8620 width = -width;
8621 }
8622 if (--fmtcnt >= 0)
8623 c = *fmt++;
8624 }
8625 else if (c >= '0' && c <= '9') {
8626 width = c - '0';
8627 while (--fmtcnt >= 0) {
8628 c = *fmt++;
8629 if (c < '0' || c > '9')
8630 break;
8631 if ((width*10) / 10 != width) {
8632 PyErr_SetString(PyExc_ValueError,
8633 "width too big");
8634 goto onError;
8635 }
8636 width = width*10 + (c - '0');
8637 }
8638 }
8639 if (c == '.') {
8640 prec = 0;
8641 if (--fmtcnt >= 0)
8642 c = *fmt++;
8643 if (c == '*') {
8644 v = getnextarg(args, arglen, &argidx);
8645 if (v == NULL)
8646 goto onError;
8647 if (!PyInt_Check(v)) {
8648 PyErr_SetString(PyExc_TypeError,
8649 "* wants int");
8650 goto onError;
8651 }
8652 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008653 if (prec == -1 && PyErr_Occurred())
8654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 if (prec < 0)
8656 prec = 0;
8657 if (--fmtcnt >= 0)
8658 c = *fmt++;
8659 }
8660 else if (c >= '0' && c <= '9') {
8661 prec = c - '0';
8662 while (--fmtcnt >= 0) {
8663 c = Py_CHARMASK(*fmt++);
8664 if (c < '0' || c > '9')
8665 break;
8666 if ((prec*10) / 10 != prec) {
8667 PyErr_SetString(PyExc_ValueError,
8668 "prec too big");
8669 goto onError;
8670 }
8671 prec = prec*10 + (c - '0');
8672 }
8673 }
8674 } /* prec */
8675 if (fmtcnt >= 0) {
8676 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 if (--fmtcnt >= 0)
8678 c = *fmt++;
8679 }
8680 }
8681 if (fmtcnt < 0) {
8682 PyErr_SetString(PyExc_ValueError,
8683 "incomplete format");
8684 goto onError;
8685 }
8686 if (c != '%') {
8687 v = getnextarg(args, arglen, &argidx);
8688 if (v == NULL)
8689 goto onError;
8690 }
8691 sign = 0;
8692 fill = ' ';
8693 switch (c) {
8694
8695 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008696 pbuf = formatbuf;
8697 /* presume that buffer length is at least 1 */
8698 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 len = 1;
8700 break;
8701
8702 case 's':
8703 case 'r':
8704 if (PyUnicode_Check(v) && c == 's') {
8705 temp = v;
8706 Py_INCREF(temp);
8707 }
8708 else {
8709 PyObject *unicode;
8710 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008711 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 else
8713 temp = PyObject_Repr(v);
8714 if (temp == NULL)
8715 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008716 if (PyUnicode_Check(temp))
8717 /* nothing to do */;
8718 else if (PyString_Check(temp)) {
8719 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008720 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008722 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008724 Py_DECREF(temp);
8725 temp = unicode;
8726 if (temp == NULL)
8727 goto onError;
8728 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008729 else {
8730 Py_DECREF(temp);
8731 PyErr_SetString(PyExc_TypeError,
8732 "%s argument has non-string str()");
8733 goto onError;
8734 }
8735 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008736 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 len = PyUnicode_GET_SIZE(temp);
8738 if (prec >= 0 && len > prec)
8739 len = prec;
8740 break;
8741
8742 case 'i':
8743 case 'd':
8744 case 'u':
8745 case 'o':
8746 case 'x':
8747 case 'X':
8748 if (c == 'i')
8749 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008750 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008751 temp = formatlong(v, flags, prec, c);
8752 if (!temp)
8753 goto onError;
8754 pbuf = PyUnicode_AS_UNICODE(temp);
8755 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008756 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008758 else {
8759 pbuf = formatbuf;
8760 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8761 flags, prec, c, v);
8762 if (len < 0)
8763 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008764 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008765 }
8766 if (flags & F_ZERO)
8767 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 break;
8769
8770 case 'e':
8771 case 'E':
8772 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008773 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 case 'g':
8775 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008776 if (c == 'F')
8777 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008778 pbuf = formatbuf;
8779 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8780 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 if (len < 0)
8782 goto onError;
8783 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008784 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 fill = '0';
8786 break;
8787
8788 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008789 pbuf = formatbuf;
8790 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 if (len < 0)
8792 goto onError;
8793 break;
8794
8795 default:
8796 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008797 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008798 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008799 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008800 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008801 (Py_ssize_t)(fmt - 1 -
8802 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 goto onError;
8804 }
8805 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008806 if (*pbuf == '-' || *pbuf == '+') {
8807 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808 len--;
8809 }
8810 else if (flags & F_SIGN)
8811 sign = '+';
8812 else if (flags & F_BLANK)
8813 sign = ' ';
8814 else
8815 sign = 0;
8816 }
8817 if (width < len)
8818 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008819 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820 reslen -= rescnt;
8821 rescnt = width + fmtcnt + 100;
8822 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008823 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008824 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008825 PyErr_NoMemory();
8826 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008827 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008828 if (_PyUnicode_Resize(&result, reslen) < 0) {
8829 Py_XDECREF(temp);
8830 goto onError;
8831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832 res = PyUnicode_AS_UNICODE(result)
8833 + reslen - rescnt;
8834 }
8835 if (sign) {
8836 if (fill != ' ')
8837 *res++ = sign;
8838 rescnt--;
8839 if (width > len)
8840 width--;
8841 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008842 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008843 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008844 assert(pbuf[1] == c);
8845 if (fill != ' ') {
8846 *res++ = *pbuf++;
8847 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008848 }
Tim Petersfff53252001-04-12 18:38:48 +00008849 rescnt -= 2;
8850 width -= 2;
8851 if (width < 0)
8852 width = 0;
8853 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008854 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 if (width > len && !(flags & F_LJUST)) {
8856 do {
8857 --rescnt;
8858 *res++ = fill;
8859 } while (--width > len);
8860 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008861 if (fill == ' ') {
8862 if (sign)
8863 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008864 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008865 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008866 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008867 *res++ = *pbuf++;
8868 *res++ = *pbuf++;
8869 }
8870 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008871 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 res += len;
8873 rescnt -= len;
8874 while (--width >= len) {
8875 --rescnt;
8876 *res++ = ' ';
8877 }
8878 if (dict && (argidx < arglen) && c != '%') {
8879 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008880 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008881 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 goto onError;
8883 }
8884 Py_XDECREF(temp);
8885 } /* '%' */
8886 } /* until end */
8887 if (argidx < arglen && !dict) {
8888 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008889 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 goto onError;
8891 }
8892
Thomas Woutersa96affe2006-03-12 00:29:36 +00008893 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8894 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 if (args_owned) {
8896 Py_DECREF(args);
8897 }
8898 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899 return (PyObject *)result;
8900
8901 onError:
8902 Py_XDECREF(result);
8903 Py_DECREF(uformat);
8904 if (args_owned) {
8905 Py_DECREF(args);
8906 }
8907 return NULL;
8908}
8909
Jeremy Hylton938ace62002-07-17 16:30:39 +00008910static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008911unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8912
Tim Peters6d6c1a32001-08-02 04:15:00 +00008913static PyObject *
8914unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8915{
8916 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008917 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008918 char *encoding = NULL;
8919 char *errors = NULL;
8920
Guido van Rossume023fe02001-08-30 03:12:59 +00008921 if (type != &PyUnicode_Type)
8922 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008923 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8924 kwlist, &x, &encoding, &errors))
8925 return NULL;
8926 if (x == NULL)
8927 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008928 if (encoding == NULL && errors == NULL)
8929 return PyObject_Unicode(x);
8930 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008931 return PyUnicode_FromEncodedObject(x, encoding, errors);
8932}
8933
Guido van Rossume023fe02001-08-30 03:12:59 +00008934static PyObject *
8935unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8936{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008937 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008938 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008939
8940 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8941 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8942 if (tmp == NULL)
8943 return NULL;
8944 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008945 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008946 if (pnew == NULL) {
8947 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008948 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008949 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008950 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8951 if (pnew->str == NULL) {
8952 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008953 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008954 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008955 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008956 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008957 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8958 pnew->length = n;
8959 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008960 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008961 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008962}
8963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008964PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008965"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008966\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008967Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008968encoding defaults to the current default string encoding.\n\
8969errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008970
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008971static PyObject *unicode_iter(PyObject *seq);
8972
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008974 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008975 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 sizeof(PyUnicodeObject), /* tp_size */
8977 0, /* tp_itemsize */
8978 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008979 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008981 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008983 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008984 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008985 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008987 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 (hashfunc) unicode_hash, /* tp_hash*/
8989 0, /* tp_call*/
8990 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008991 PyObject_GenericGetAttr, /* tp_getattro */
8992 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008993 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008994 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8995 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008996 unicode_doc, /* tp_doc */
8997 0, /* tp_traverse */
8998 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008999 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009000 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009001 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009002 0, /* tp_iternext */
9003 unicode_methods, /* tp_methods */
9004 0, /* tp_members */
9005 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00009006 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009007 0, /* tp_dict */
9008 0, /* tp_descr_get */
9009 0, /* tp_descr_set */
9010 0, /* tp_dictoffset */
9011 0, /* tp_init */
9012 0, /* tp_alloc */
9013 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009014 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015};
9016
9017/* Initialize the Unicode implementation */
9018
Thomas Wouters78890102000-07-22 19:25:51 +00009019void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009021 int i;
9022
Thomas Wouters477c8d52006-05-27 19:21:47 +00009023 /* XXX - move this array to unicodectype.c ? */
9024 Py_UNICODE linebreak[] = {
9025 0x000A, /* LINE FEED */
9026 0x000D, /* CARRIAGE RETURN */
9027 0x001C, /* FILE SEPARATOR */
9028 0x001D, /* GROUP SEPARATOR */
9029 0x001E, /* RECORD SEPARATOR */
9030 0x0085, /* NEXT LINE */
9031 0x2028, /* LINE SEPARATOR */
9032 0x2029, /* PARAGRAPH SEPARATOR */
9033 };
9034
Fred Drakee4315f52000-05-09 19:53:39 +00009035 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009036 unicode_freelist = NULL;
9037 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009039 if (!unicode_empty)
9040 return;
9041
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009042 for (i = 0; i < 256; i++)
9043 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009044 if (PyType_Ready(&PyUnicode_Type) < 0)
9045 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009046
9047 /* initialize the linebreak bloom filter */
9048 bloom_linebreak = make_bloom_mask(
9049 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9050 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009051
9052 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053}
9054
9055/* Finalize the Unicode implementation */
9056
9057void
Thomas Wouters78890102000-07-22 19:25:51 +00009058_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009060 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009061 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009063 Py_XDECREF(unicode_empty);
9064 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009065
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009066 for (i = 0; i < 256; i++) {
9067 if (unicode_latin1[i]) {
9068 Py_DECREF(unicode_latin1[i]);
9069 unicode_latin1[i] = NULL;
9070 }
9071 }
9072
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009073 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074 PyUnicodeObject *v = u;
9075 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009076 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009077 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009078 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009079 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009081 unicode_freelist = NULL;
9082 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009084
Walter Dörwald16807132007-05-25 13:52:07 +00009085void
9086PyUnicode_InternInPlace(PyObject **p)
9087{
9088 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9089 PyObject *t;
9090 if (s == NULL || !PyUnicode_Check(s))
9091 Py_FatalError(
9092 "PyUnicode_InternInPlace: unicode strings only please!");
9093 /* If it's a subclass, we don't really know what putting
9094 it in the interned dict might do. */
9095 if (!PyUnicode_CheckExact(s))
9096 return;
9097 if (PyUnicode_CHECK_INTERNED(s))
9098 return;
9099 if (interned == NULL) {
9100 interned = PyDict_New();
9101 if (interned == NULL) {
9102 PyErr_Clear(); /* Don't leave an exception */
9103 return;
9104 }
9105 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009106 /* It might be that the GetItem call fails even
9107 though the key is present in the dictionary,
9108 namely when this happens during a stack overflow. */
9109 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009110 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009111 Py_END_ALLOW_RECURSION
9112
Walter Dörwald16807132007-05-25 13:52:07 +00009113 if (t) {
9114 Py_INCREF(t);
9115 Py_DECREF(*p);
9116 *p = t;
9117 return;
9118 }
9119
Martin v. Löwis5b222132007-06-10 09:51:05 +00009120 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009121 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9122 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009123 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009124 return;
9125 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009126 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009127 /* The two references in interned are not counted by refcnt.
9128 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009129 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009130 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9131}
9132
9133void
9134PyUnicode_InternImmortal(PyObject **p)
9135{
9136 PyUnicode_InternInPlace(p);
9137 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9138 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9139 Py_INCREF(*p);
9140 }
9141}
9142
9143PyObject *
9144PyUnicode_InternFromString(const char *cp)
9145{
9146 PyObject *s = PyUnicode_FromString(cp);
9147 if (s == NULL)
9148 return NULL;
9149 PyUnicode_InternInPlace(&s);
9150 return s;
9151}
9152
9153void _Py_ReleaseInternedUnicodeStrings(void)
9154{
9155 PyObject *keys;
9156 PyUnicodeObject *s;
9157 Py_ssize_t i, n;
9158 Py_ssize_t immortal_size = 0, mortal_size = 0;
9159
9160 if (interned == NULL || !PyDict_Check(interned))
9161 return;
9162 keys = PyDict_Keys(interned);
9163 if (keys == NULL || !PyList_Check(keys)) {
9164 PyErr_Clear();
9165 return;
9166 }
9167
9168 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9169 detector, interned unicode strings are not forcibly deallocated;
9170 rather, we give them their stolen references back, and then clear
9171 and DECREF the interned dict. */
9172
9173 n = PyList_GET_SIZE(keys);
9174 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9175 n);
9176 for (i = 0; i < n; i++) {
9177 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9178 switch (s->state) {
9179 case SSTATE_NOT_INTERNED:
9180 /* XXX Shouldn't happen */
9181 break;
9182 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009183 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009184 immortal_size += s->length;
9185 break;
9186 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009187 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009188 mortal_size += s->length;
9189 break;
9190 default:
9191 Py_FatalError("Inconsistent interned string state.");
9192 }
9193 s->state = SSTATE_NOT_INTERNED;
9194 }
9195 fprintf(stderr, "total size of all interned strings: "
9196 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9197 "mortal/immortal\n", mortal_size, immortal_size);
9198 Py_DECREF(keys);
9199 PyDict_Clear(interned);
9200 Py_DECREF(interned);
9201 interned = NULL;
9202}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009203
9204
9205/********************* Unicode Iterator **************************/
9206
9207typedef struct {
9208 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009209 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009210 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9211} unicodeiterobject;
9212
9213static void
9214unicodeiter_dealloc(unicodeiterobject *it)
9215{
9216 _PyObject_GC_UNTRACK(it);
9217 Py_XDECREF(it->it_seq);
9218 PyObject_GC_Del(it);
9219}
9220
9221static int
9222unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9223{
9224 Py_VISIT(it->it_seq);
9225 return 0;
9226}
9227
9228static PyObject *
9229unicodeiter_next(unicodeiterobject *it)
9230{
9231 PyUnicodeObject *seq;
9232 PyObject *item;
9233
9234 assert(it != NULL);
9235 seq = it->it_seq;
9236 if (seq == NULL)
9237 return NULL;
9238 assert(PyUnicode_Check(seq));
9239
9240 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009241 item = PyUnicode_FromUnicode(
9242 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009243 if (item != NULL)
9244 ++it->it_index;
9245 return item;
9246 }
9247
9248 Py_DECREF(seq);
9249 it->it_seq = NULL;
9250 return NULL;
9251}
9252
9253static PyObject *
9254unicodeiter_len(unicodeiterobject *it)
9255{
9256 Py_ssize_t len = 0;
9257 if (it->it_seq)
9258 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9259 return PyInt_FromSsize_t(len);
9260}
9261
9262PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9263
9264static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009265 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9266 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009267 {NULL, NULL} /* sentinel */
9268};
9269
9270PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009271 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009272 "unicodeiterator", /* tp_name */
9273 sizeof(unicodeiterobject), /* tp_basicsize */
9274 0, /* tp_itemsize */
9275 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009276 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009277 0, /* tp_print */
9278 0, /* tp_getattr */
9279 0, /* tp_setattr */
9280 0, /* tp_compare */
9281 0, /* tp_repr */
9282 0, /* tp_as_number */
9283 0, /* tp_as_sequence */
9284 0, /* tp_as_mapping */
9285 0, /* tp_hash */
9286 0, /* tp_call */
9287 0, /* tp_str */
9288 PyObject_GenericGetAttr, /* tp_getattro */
9289 0, /* tp_setattro */
9290 0, /* tp_as_buffer */
9291 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9292 0, /* tp_doc */
9293 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9294 0, /* tp_clear */
9295 0, /* tp_richcompare */
9296 0, /* tp_weaklistoffset */
9297 PyObject_SelfIter, /* tp_iter */
9298 (iternextfunc)unicodeiter_next, /* tp_iternext */
9299 unicodeiter_methods, /* tp_methods */
9300 0,
9301};
9302
9303static PyObject *
9304unicode_iter(PyObject *seq)
9305{
9306 unicodeiterobject *it;
9307
9308 if (!PyUnicode_Check(seq)) {
9309 PyErr_BadInternalCall();
9310 return NULL;
9311 }
9312 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9313 if (it == NULL)
9314 return NULL;
9315 it->it_index = 0;
9316 Py_INCREF(seq);
9317 it->it_seq = (PyUnicodeObject *)seq;
9318 _PyObject_GC_TRACK(it);
9319 return (PyObject *)it;
9320}
9321
Martin v. Löwis5b222132007-06-10 09:51:05 +00009322size_t
9323Py_UNICODE_strlen(const Py_UNICODE *u)
9324{
9325 int res = 0;
9326 while(*u++)
9327 res++;
9328 return res;
9329}
9330
9331Py_UNICODE*
9332Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9333{
9334 Py_UNICODE *u = s1;
9335 while ((*u++ = *s2++));
9336 return s1;
9337}
9338
9339Py_UNICODE*
9340Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9341{
9342 Py_UNICODE *u = s1;
9343 while ((*u++ = *s2++))
9344 if (n-- == 0)
9345 break;
9346 return s1;
9347}
9348
9349int
9350Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9351{
9352 while (*s1 && *s2 && *s1 == *s2)
9353 s1++, s2++;
9354 if (*s1 && *s2)
9355 return (*s1 < *s2) ? -1 : +1;
9356 if (*s1)
9357 return 1;
9358 if (*s2)
9359 return -1;
9360 return 0;
9361}
9362
9363Py_UNICODE*
9364Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9365{
9366 const Py_UNICODE *p;
9367 for (p = s; *p; p++)
9368 if (*p == c)
9369 return (Py_UNICODE*)p;
9370 return NULL;
9371}
9372
9373
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009374#ifdef __cplusplus
9375}
9376#endif
9377
9378
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/