blob: ae34c9e589ecc44dd335e4b792be0c2e55965ada [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
57#define MAX_UNICODE_FREELIST_SIZE 1024
58
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Barry Warsaw51ac5802000-03-20 16:36:48 +000065 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000109static PyUnicodeObject *unicode_freelist;
110static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000129PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000130{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000131#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000132 return 0x10FFFF;
133#else
134 /* This is actually an illegal character, so it should
135 not be passed to unichr. */
136 return 0xFFFF;
137#endif
138}
139
Thomas Wouters477c8d52006-05-27 19:21:47 +0000140/* --- Bloom Filters ----------------------------------------------------- */
141
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the 5 least
   significant bits of each Unicode character as the bit index. */
145
146/* the linebreak mask is set up by Unicode_Init below */
147
/* Integer type holding a bloom bitmask; only bits 0..31 are ever used,
   one for each possible value of a character's low 5 bits. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() below. */
static BLOOM_MASK bloom_linebreak;

/* Evaluates to non-zero if the bit selected by the low 5 bits of ch is
   set in mask.  A zero result means ch cannot be in the set the mask was
   built from; a non-zero result still has to be confirmed by an exact
   test.

   The constant is 1UL rather than 1: shifting a signed int 1 left by 31
   is undefined behavior, and where it "works" the result is sign
   extended when widened to unsigned long.  mask is parenthesized so that
   expressions such as BLOOM(a | b, ch) group as intended. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch is a line break character: cheap bloom test first,
   exact Py_UNICODE_ISLINEBREAK() check only when the bloom test hits. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
156
157Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
158{
159 /* calculate simple bloom-style bitmask for a given unicode string */
160
161 long mask;
162 Py_ssize_t i;
163
164 mask = 0;
165 for (i = 0; i < len; i++)
166 mask |= (1 << (ptr[i] & 0x1F));
167
168 return mask;
169}
170
171Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
172{
173 Py_ssize_t i;
174
175 for (i = 0; i < setlen; i++)
176 if (set[i] == chr)
177 return 1;
178
179 return 0;
180}
181
/* Non-zero if chr is a member of set[0..setlen-1].  The bloom mask is
   consulted first so that the linear unicode_member() scan only runs
   when the mask says chr might be present.

   The whole expansion and every argument are parenthesized so the macro
   behaves like a single expression at the call site (e.g. under ! or
   next to ||). */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
184
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185/* --- Unicode Object ----------------------------------------------------- */
186
187static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000189 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190{
191 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000192
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000193 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 /* Resizing shared object (unicode_empty or single character
198 objects) in-place is not allowed. Use PyUnicode_Resize()
199 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 if (unicode == unicode_empty ||
202 (unicode->length == 1 &&
203 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 return -1;
208 }
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210 /* We allocate one more byte to make sure the string is Ux0000 terminated.
211 The overallocation is also used by fastsearch, which assumes that it's
212 safe to look at str[length] (without making any assumptions about what
213 it contains). */
214
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 oldstr = unicode->str;
216 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
217 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 PyErr_NoMemory();
220 return -1;
221 }
222 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000223 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000225 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000227 if (unicode->defenc) {
228 Py_DECREF(unicode->defenc);
229 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 }
231 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 return 0;
234}
235
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239
240 XXX This allocator could further be enhanced by assuring that the
241 free list never reduces its size below 1.
242
243*/
244
245static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 register PyUnicodeObject *unicode;
249
Thomas Wouters477c8d52006-05-27 19:21:47 +0000250 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (length == 0 && unicode_empty != NULL) {
252 Py_INCREF(unicode_empty);
253 return unicode_empty;
254 }
255
256 /* Unicode freelist & memory allocation */
257 if (unicode_freelist) {
258 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000259 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Keep-Alive optimization: we only upsize the buffer,
263 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000264 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000265 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000266 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 }
269 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000270 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000272 }
273 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000276 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (unicode == NULL)
278 return NULL;
279 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
280 }
281
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000282 if (!unicode->str) {
283 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000284 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000285 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000287 * the caller fails before initializing str -- unicode_resize()
288 * reads str[0], and the Keep-Alive optimization can keep memory
289 * allocated for str alive across a call to unicode_dealloc(unicode).
290 * We don't want unicode_resize to read uninitialized memory in
291 * that case.
292 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000293 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000297 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000298 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000300
301 onError:
302 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000303 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305}
306
307static
Guido van Rossum9475a232001-10-05 20:51:39 +0000308void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
Walter Dörwald16807132007-05-25 13:52:07 +0000310 switch (PyUnicode_CHECK_INTERNED(unicode)) {
311 case SSTATE_NOT_INTERNED:
312 break;
313
314 case SSTATE_INTERNED_MORTAL:
315 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000316 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000317 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
318 Py_FatalError(
319 "deletion of interned unicode string failed");
320 break;
321
322 case SSTATE_INTERNED_IMMORTAL:
323 Py_FatalError("Immortal interned unicode string died.");
324
325 default:
326 Py_FatalError("Inconsistent interned unicode string state.");
327 }
328
Guido van Rossum604ddf82001-12-06 20:03:56 +0000329 if (PyUnicode_CheckExact(unicode) &&
330 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000331 /* Keep-Alive optimization */
332 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000333 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 unicode->str = NULL;
335 unicode->length = 0;
336 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000337 if (unicode->defenc) {
338 Py_DECREF(unicode->defenc);
339 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000340 }
341 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 *(PyUnicodeObject **)unicode = unicode_freelist;
343 unicode_freelist = unicode;
344 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000347 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000349 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 }
351}
352
Martin v. Löwis18e16552006-02-15 17:27:45 +0000353int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000354{
355 register PyUnicodeObject *v;
356
357 /* Argument checks */
358 if (unicode == NULL) {
359 PyErr_BadInternalCall();
360 return -1;
361 }
362 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000363 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 PyErr_BadInternalCall();
365 return -1;
366 }
367
368 /* Resizing unicode_empty and single character objects is not
369 possible since these are being shared. We simply return a fresh
370 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000371 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 (v == unicode_empty || v->length == 1)) {
373 PyUnicodeObject *w = _PyUnicode_New(length);
374 if (w == NULL)
375 return -1;
376 Py_UNICODE_COPY(w->str, v->str,
377 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000378 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 *unicode = (PyObject *)w;
380 return 0;
381 }
382
383 /* Note that we don't have to modify *unicode for unshared Unicode
384 objects, since we can modify them in-place. */
385 return unicode_resize(v, length);
386}
387
388/* Internal API for use in unicodeobject.c only ! */
389#define _PyUnicode_Resize(unicodevar, length) \
390 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000393 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 PyUnicodeObject *unicode;
396
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000397 /* If the Unicode data is known at construction time, we can apply
398 some optimizations which share commonly used objects. */
399 if (u != NULL) {
400
401 /* Optimization for empty strings */
402 if (size == 0 && unicode_empty != NULL) {
403 Py_INCREF(unicode_empty);
404 return (PyObject *)unicode_empty;
405 }
406
407 /* Single character Unicode objects in the Latin-1 range are
408 shared when using this constructor */
409 if (size == 1 && *u < 256) {
410 unicode = unicode_latin1[*u];
411 if (!unicode) {
412 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413 if (!unicode)
414 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000415 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 unicode_latin1[*u] = unicode;
417 }
418 Py_INCREF(unicode);
419 return (PyObject *)unicode;
420 }
421 }
Tim Petersced69f82003-09-16 20:30:58 +0000422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 unicode = _PyUnicode_New(size);
424 if (!unicode)
425 return NULL;
426
427 /* Copy the Unicode data into the new object */
428 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430
431 return (PyObject *)unicode;
432}
433
Walter Dörwaldd2034312007-05-18 16:29:38 +0000434PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000435{
436 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000437 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000438 some optimizations which share commonly used objects.
439 Also, this means the input must be UTF-8, so fall back to the
440 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000441 if (u != NULL) {
442
443 /* Optimization for empty strings */
444 if (size == 0 && unicode_empty != NULL) {
445 Py_INCREF(unicode_empty);
446 return (PyObject *)unicode_empty;
447 }
448
Martin v. Löwis9c121062007-08-05 20:26:11 +0000449 /* Single characters are shared when using this constructor.
450 Restrict to ASCII, since the input must be UTF-8. */
451 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000453 if (!unicode) {
454 unicode = _PyUnicode_New(1);
455 if (!unicode)
456 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000457 unicode->str[0] = Py_CHARMASK(*u);
458 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000459 }
460 Py_INCREF(unicode);
461 return (PyObject *)unicode;
462 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000463
464 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
Walter Dörwald55507312007-05-18 13:12:10 +0000467 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000468 if (!unicode)
469 return NULL;
470
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000471 return (PyObject *)unicode;
472}
473
Walter Dörwaldd2034312007-05-18 16:29:38 +0000474PyObject *PyUnicode_FromString(const char *u)
475{
476 size_t size = strlen(u);
477 if (size > PY_SSIZE_T_MAX) {
478 PyErr_SetString(PyExc_OverflowError, "input too long");
479 return NULL;
480 }
481
482 return PyUnicode_FromStringAndSize(u, size);
483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485#ifdef HAVE_WCHAR_H
486
487PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000488 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489{
490 PyUnicodeObject *unicode;
491
492 if (w == NULL) {
493 PyErr_BadInternalCall();
494 return NULL;
495 }
496
497 unicode = _PyUnicode_New(size);
498 if (!unicode)
499 return NULL;
500
501 /* Copy the wchar_t data into the new object */
502#ifdef HAVE_USABLE_WCHAR_T
503 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000504#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 {
506 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000509 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 *u++ = *w++;
511 }
512#endif
513
514 return (PyObject *)unicode;
515}
516
Walter Dörwald346737f2007-05-31 10:44:43 +0000517static void
518makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
519{
520 *fmt++ = '%';
521 if (width) {
522 if (zeropad)
523 *fmt++ = '0';
524 fmt += sprintf(fmt, "%d", width);
525 }
526 if (precision)
527 fmt += sprintf(fmt, ".%d", precision);
528 if (longflag)
529 *fmt++ = 'l';
530 else if (size_tflag) {
531 char *f = PY_FORMAT_SIZE_T;
532 while (*f)
533 *fmt++ = *f++;
534 }
535 *fmt++ = c;
536 *fmt = '\0';
537}
538
Walter Dörwaldd2034312007-05-18 16:29:38 +0000539#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
540
541PyObject *
542PyUnicode_FromFormatV(const char *format, va_list vargs)
543{
544 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000545 Py_ssize_t callcount = 0;
546 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000547 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 int width = 0;
550 int precision = 0;
551 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000552 const char* f;
553 Py_UNICODE *s;
554 PyObject *string;
555 /* used by sprintf */
556 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000557 /* use abuffer instead of buffer, if we need more space
558 * (which can happen if there's a format specifier with width). */
559 char *abuffer = NULL;
560 char *realbuffer;
561 Py_ssize_t abuffersize = 0;
562 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000563 const char *copy;
564
565#ifdef VA_LIST_IS_ARRAY
566 Py_MEMCPY(count, vargs, sizeof(va_list));
567#else
568#ifdef __va_copy
569 __va_copy(count, vargs);
570#else
571 count = vargs;
572#endif
573#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 1: count the number of %S/%R format specifications
575 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
576 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000577 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000578 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 ++callcount;
580 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000581 /* step 2: allocate memory for the results of
582 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000583 if (callcount) {
584 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
585 if (!callresults) {
586 PyErr_NoMemory();
587 return NULL;
588 }
589 callresult = callresults;
590 }
591 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000592 for (f = format; *f; f++) {
593 if (*f == '%') {
594 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000595 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000596 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000597 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000598 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000599 ;
600
601 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
602 * they don't affect the amount of space we reserve.
603 */
604 if ((*f == 'l' || *f == 'z') &&
605 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000606 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607
608 switch (*f) {
609 case 'c':
610 (void)va_arg(count, int);
611 /* fall through... */
612 case '%':
613 n++;
614 break;
615 case 'd': case 'u': case 'i': case 'x':
616 (void) va_arg(count, int);
617 /* 20 bytes is enough to hold a 64-bit
618 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000619 This isn't enough for octal.
620 If a width is specified we need more
621 (which we allocate later). */
622 if (width < 20)
623 width = 20;
624 n += width;
625 if (abuffersize < width)
626 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000627 break;
628 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000629 {
630 /* UTF-8 */
631 unsigned char*s;
632 s = va_arg(count, unsigned char*);
633 while (*s) {
634 if (*s < 128) {
635 n++; s++;
636 } else if (*s < 0xc0) {
637 /* invalid UTF-8 */
638 n++; s++;
639 } else if (*s < 0xc0) {
640 n++;
641 s++; if(!*s)break;
642 s++;
643 } else if (*s < 0xe0) {
644 n++;
645 s++; if(!*s)break;
646 s++; if(!*s)break;
647 s++;
648 } else {
649 #ifdef Py_UNICODE_WIDE
650 n++;
651 #else
652 n+=2;
653 #endif
654 s++; if(!*s)break;
655 s++; if(!*s)break;
656 s++; if(!*s)break;
657 s++;
658 }
659 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000660 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000661 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 case 'U':
663 {
664 PyObject *obj = va_arg(count, PyObject *);
665 assert(obj && PyUnicode_Check(obj));
666 n += PyUnicode_GET_SIZE(obj);
667 break;
668 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000669 case 'V':
670 {
671 PyObject *obj = va_arg(count, PyObject *);
672 const char *str = va_arg(count, const char *);
673 assert(obj || str);
674 assert(!obj || PyUnicode_Check(obj));
675 if (obj)
676 n += PyUnicode_GET_SIZE(obj);
677 else
678 n += strlen(str);
679 break;
680 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000681 case 'S':
682 {
683 PyObject *obj = va_arg(count, PyObject *);
684 PyObject *str;
685 assert(obj);
686 str = PyObject_Unicode(obj);
687 if (!str)
688 goto fail;
689 n += PyUnicode_GET_SIZE(str);
690 /* Remember the str and switch to the next slot */
691 *callresult++ = str;
692 break;
693 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000694 case 'R':
695 {
696 PyObject *obj = va_arg(count, PyObject *);
697 PyObject *repr;
698 assert(obj);
699 repr = PyObject_Repr(obj);
700 if (!repr)
701 goto fail;
702 n += PyUnicode_GET_SIZE(repr);
703 /* Remember the repr and switch to the next slot */
704 *callresult++ = repr;
705 break;
706 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000707 case 'p':
708 (void) va_arg(count, int);
709 /* maximum 64-bit pointer representation:
710 * 0xffffffffffffffff
711 * so 19 characters is enough.
712 * XXX I count 18 -- what's the extra for?
713 */
714 n += 19;
715 break;
716 default:
717 /* if we stumble upon an unknown
718 formatting code, copy the rest of
719 the format string to the output
720 string. (we cannot just skip the
721 code, since there's no way to know
722 what's in the argument list) */
723 n += strlen(p);
724 goto expand;
725 }
726 } else
727 n++;
728 }
729 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 if (abuffersize > 20) {
731 abuffer = PyMem_Malloc(abuffersize);
732 if (!abuffer) {
733 PyErr_NoMemory();
734 goto fail;
735 }
736 realbuffer = abuffer;
737 }
738 else
739 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000740 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000741 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000742 we don't have to resize the string.
743 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 string = PyUnicode_FromUnicode(NULL, n);
745 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747
748 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000749 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750
751 for (f = format; *f; f++) {
752 if (*f == '%') {
753 const char* p = f++;
754 int longflag = 0;
755 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 zeropad = (*f == '0');
757 /* parse the width.precision part */
758 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000759 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 width = (width*10) + *f++ - '0';
761 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 if (*f == '.') {
763 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000764 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 /* handle the long flag, but only for %ld and %lu.
768 others can be added when necessary. */
769 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
770 longflag = 1;
771 ++f;
772 }
773 /* handle the size_t flag. */
774 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
775 size_tflag = 1;
776 ++f;
777 }
778
779 switch (*f) {
780 case 'c':
781 *s++ = va_arg(vargs, int);
782 break;
783 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000784 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000786 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000788 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000789 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000790 sprintf(realbuffer, fmt, va_arg(vargs, int));
791 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 break;
793 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000794 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000795 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000796 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000798 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000800 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
801 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 break;
803 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
805 sprintf(realbuffer, fmt, va_arg(vargs, int));
806 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 break;
808 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000809 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
810 sprintf(realbuffer, fmt, va_arg(vargs, int));
811 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000812 break;
813 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000814 {
815 /* Parameter must be UTF-8 encoded.
816 In case of encoding errors, use
817 the replacement character. */
818 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000820 u = PyUnicode_DecodeUTF8(p, strlen(p),
821 "replace");
822 if (!u)
823 goto fail;
824 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
825 PyUnicode_GET_SIZE(u));
826 s += PyUnicode_GET_SIZE(u);
827 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000829 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000830 case 'U':
831 {
832 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000833 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
834 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
835 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 break;
837 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000838 case 'V':
839 {
840 PyObject *obj = va_arg(vargs, PyObject *);
841 const char *str = va_arg(vargs, const char *);
842 if (obj) {
843 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
844 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
845 s += size;
846 } else {
847 appendstring(str);
848 }
849 break;
850 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000851 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 case 'R':
853 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000854 Py_UNICODE *ucopy;
855 Py_ssize_t usize;
856 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000857 /* unused, since we already have the result */
858 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000859 ucopy = PyUnicode_AS_UNICODE(*callresult);
860 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 for (upos = 0; upos<usize;)
862 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000863 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000864 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000865 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000866 ++callresult;
867 break;
868 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000869 case 'p':
870 sprintf(buffer, "%p", va_arg(vargs, void*));
871 /* %p is ill-defined: ensure leading 0x. */
872 if (buffer[1] == 'X')
873 buffer[1] = 'x';
874 else if (buffer[1] != 'x') {
875 memmove(buffer+2, buffer, strlen(buffer)+1);
876 buffer[0] = '0';
877 buffer[1] = 'x';
878 }
879 appendstring(buffer);
880 break;
881 case '%':
882 *s++ = '%';
883 break;
884 default:
885 appendstring(p);
886 goto end;
887 }
888 } else
889 *s++ = *f;
890 }
891
892 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000893 if (callresults)
894 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 if (abuffer)
896 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
898 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000899 fail:
900 if (callresults) {
901 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000902 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000903 Py_DECREF(*callresult2);
904 ++callresult2;
905 }
906 PyMem_Free(callresults);
907 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000908 if (abuffer)
909 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000910 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000911}
912
913#undef appendstring
914
915PyObject *
916PyUnicode_FromFormat(const char *format, ...)
917{
918 PyObject* ret;
919 va_list vargs;
920
921#ifdef HAVE_STDARG_PROTOTYPES
922 va_start(vargs, format);
923#else
924 va_start(vargs);
925#endif
926 ret = PyUnicode_FromFormatV(format, vargs);
927 va_end(vargs);
928 return ret;
929}
930
Martin v. Löwis18e16552006-02-15 17:27:45 +0000931Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
932 wchar_t *w,
933 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934{
935 if (unicode == NULL) {
936 PyErr_BadInternalCall();
937 return -1;
938 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000939
940 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000942 size = PyUnicode_GET_SIZE(unicode) + 1;
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944#ifdef HAVE_USABLE_WCHAR_T
945 memcpy(w, unicode->str, size * sizeof(wchar_t));
946#else
947 {
948 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000949 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000951 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 *w++ = *u++;
953 }
954#endif
955
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000956 if (size > PyUnicode_GET_SIZE(unicode))
957 return PyUnicode_GET_SIZE(unicode);
958 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000959 return size;
960}
961
962#endif
963
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000964PyObject *PyUnicode_FromOrdinal(int ordinal)
965{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000966 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000967
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000968 if (ordinal < 0 || ordinal > 0x10ffff) {
969 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000970 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000971 return NULL;
972 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000973
974#ifndef Py_UNICODE_WIDE
975 if (ordinal > 0xffff) {
976 ordinal -= 0x10000;
977 s[0] = 0xD800 | (ordinal >> 10);
978 s[1] = 0xDC00 | (ordinal & 0x3FF);
979 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000980 }
981#endif
982
Hye-Shik Chang40574832004-04-06 07:24:51 +0000983 s[0] = (Py_UNICODE)ordinal;
984 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000985}
986
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987PyObject *PyUnicode_FromObject(register PyObject *obj)
988{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989 /* XXX Perhaps we should make this API an alias of
990 PyObject_Unicode() instead ?! */
991 if (PyUnicode_CheckExact(obj)) {
992 Py_INCREF(obj);
993 return obj;
994 }
995 if (PyUnicode_Check(obj)) {
996 /* For a Unicode subtype that's not a Unicode object,
997 return a true Unicode object with the same data. */
998 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
999 PyUnicode_GET_SIZE(obj));
1000 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001001 PyErr_Format(PyExc_TypeError,
1002 "Can't convert '%.100s' object to str implicitly",
1003 Py_Type(obj)->tp_name);
1004 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001005}
1006
1007PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1008 const char *encoding,
1009 const char *errors)
1010{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001011 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001012 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001014
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 if (obj == NULL) {
1016 PyErr_BadInternalCall();
1017 return NULL;
1018 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001020 if (PyUnicode_Check(obj)) {
1021 PyErr_SetString(PyExc_TypeError,
1022 "decoding Unicode is not supported");
1023 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001024 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001025
1026 /* Coerce object */
1027 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001028 s = PyString_AS_STRING(obj);
1029 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001030 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001031 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1032 /* Overwrite the error message with something more useful in
1033 case of a TypeError. */
1034 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001035 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001036 "coercing to Unicode: need string or buffer, "
1037 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001038 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001039 goto onError;
1040 }
Tim Petersced69f82003-09-16 20:30:58 +00001041
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (len == 0) {
1044 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 }
Tim Petersced69f82003-09-16 20:30:58 +00001047 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001048 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001049
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001050 return v;
1051
1052 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054}
1055
1056PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001057 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 const char *encoding,
1059 const char *errors)
1060{
1061 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001062 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001063 char lower[20]; /* Enough for any encoding name we recognize */
1064 char *l;
1065 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066
1067 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001068 encoding = PyUnicode_GetDefaultEncoding();
1069
1070 /* Convert encoding to lower case and replace '_' with '-' in order to
1071 catch e.g. UTF_8 */
1072 e = encoding;
1073 l = lower;
1074 while (*e && l < &lower[(sizeof lower) - 2]) {
1075 if (ISUPPER(*e)) {
1076 *l++ = TOLOWER(*e++);
1077 }
1078 else if (*e == '_') {
1079 *l++ = '-';
1080 e++;
1081 }
1082 else {
1083 *l++ = *e++;
1084 }
1085 }
1086 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001087
1088 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001089 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001091 else if ((strcmp(lower, "latin-1") == 0) ||
1092 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001093 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001094#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001095 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001096 return PyUnicode_DecodeMBCS(s, size, errors);
1097#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001098 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001099 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001100 else if (strcmp(lower, "utf-16") == 0)
1101 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1102 else if (strcmp(lower, "utf-32") == 0)
1103 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104
1105 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001106 buffer = NULL;
1107 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1108 goto onError;
1109 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (buffer == NULL)
1111 goto onError;
1112 unicode = PyCodec_Decode(buffer, encoding, errors);
1113 if (unicode == NULL)
1114 goto onError;
1115 if (!PyUnicode_Check(unicode)) {
1116 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001117 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001118 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 Py_DECREF(unicode);
1120 goto onError;
1121 }
1122 Py_DECREF(buffer);
1123 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001124
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 onError:
1126 Py_XDECREF(buffer);
1127 return NULL;
1128}
1129
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001130PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1131 const char *encoding,
1132 const char *errors)
1133{
1134 PyObject *v;
1135
1136 if (!PyUnicode_Check(unicode)) {
1137 PyErr_BadArgument();
1138 goto onError;
1139 }
1140
1141 if (encoding == NULL)
1142 encoding = PyUnicode_GetDefaultEncoding();
1143
1144 /* Decode via the codec registry */
1145 v = PyCodec_Decode(unicode, encoding, errors);
1146 if (v == NULL)
1147 goto onError;
1148 return v;
1149
1150 onError:
1151 return NULL;
1152}
1153
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001155 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 const char *encoding,
1157 const char *errors)
1158{
1159 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 unicode = PyUnicode_FromUnicode(s, size);
1162 if (unicode == NULL)
1163 return NULL;
1164 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1165 Py_DECREF(unicode);
1166 return v;
1167}
1168
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001169PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1170 const char *encoding,
1171 const char *errors)
1172{
1173 PyObject *v;
1174
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 goto onError;
1178 }
1179
1180 if (encoding == NULL)
1181 encoding = PyUnicode_GetDefaultEncoding();
1182
1183 /* Encode via the codec registry */
1184 v = PyCodec_Encode(unicode, encoding, errors);
1185 if (v == NULL)
1186 goto onError;
1187 return v;
1188
1189 onError:
1190 return NULL;
1191}
1192
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1194 const char *encoding,
1195 const char *errors)
1196{
1197 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001198
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 if (!PyUnicode_Check(unicode)) {
1200 PyErr_BadArgument();
1201 goto onError;
1202 }
Fred Drakee4315f52000-05-09 19:53:39 +00001203
Tim Petersced69f82003-09-16 20:30:58 +00001204 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001205 encoding = PyUnicode_GetDefaultEncoding();
1206
1207 /* Shortcuts for common default encodings */
1208 if (errors == NULL) {
1209 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001210 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001211 else if (strcmp(encoding, "latin-1") == 0)
1212 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001213#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1214 else if (strcmp(encoding, "mbcs") == 0)
1215 return PyUnicode_AsMBCSString(unicode);
1216#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001217 else if (strcmp(encoding, "ascii") == 0)
1218 return PyUnicode_AsASCIIString(unicode);
1219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220
1221 /* Encode via the codec registry */
1222 v = PyCodec_Encode(unicode, encoding, errors);
1223 if (v == NULL)
1224 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001225 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001227
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 onError:
1229 return NULL;
1230}
1231
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001232PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1233 const char *errors)
1234{
1235 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001236 if (v)
1237 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001238 if (errors != NULL)
1239 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001241 PyUnicode_GET_SIZE(unicode),
1242 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001243 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001244 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001245 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001246 return v;
1247}
1248
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001249PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001250PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001251 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001252 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1253}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001254
Christian Heimes5894ba72007-11-04 11:43:14 +00001255PyObject*
1256PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1257{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001258 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1259 can be undefined. If it is case, decode using UTF-8. The following assumes
1260 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1261 bootstrapping process where the codecs aren't ready yet.
1262 */
1263 if (Py_FileSystemDefaultEncoding) {
1264#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001265 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001266 return PyUnicode_DecodeMBCS(s, size, "replace");
1267 }
1268#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001269 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001270 return PyUnicode_DecodeUTF8(s, size, "replace");
1271 }
1272#endif
1273 return PyUnicode_Decode(s, size,
1274 Py_FileSystemDefaultEncoding,
1275 "replace");
1276 }
1277 else {
1278 return PyUnicode_DecodeUTF8(s, size, "replace");
1279 }
1280}
1281
Martin v. Löwis5b222132007-06-10 09:51:05 +00001282char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001283PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001284{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001285 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 return NULL;
1289 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001290 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1291 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001292 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001293 if (psize != NULL)
1294 *psize = PyString_GET_SIZE(str8);
1295 return PyString_AS_STRING(str8);
1296}
1297
1298char*
1299PyUnicode_AsString(PyObject *unicode)
1300{
1301 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001302}
1303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1305{
1306 if (!PyUnicode_Check(unicode)) {
1307 PyErr_BadArgument();
1308 goto onError;
1309 }
1310 return PyUnicode_AS_UNICODE(unicode);
1311
1312 onError:
1313 return NULL;
1314}
1315
Martin v. Löwis18e16552006-02-15 17:27:45 +00001316Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (!PyUnicode_Check(unicode)) {
1319 PyErr_BadArgument();
1320 goto onError;
1321 }
1322 return PyUnicode_GET_SIZE(unicode);
1323
1324 onError:
1325 return -1;
1326}
1327
Thomas Wouters78890102000-07-22 19:25:51 +00001328const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001329{
1330 return unicode_default_encoding;
1331}
1332
1333int PyUnicode_SetDefaultEncoding(const char *encoding)
1334{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001335 if (strcmp(encoding, unicode_default_encoding) != 0) {
1336 PyErr_Format(PyExc_ValueError,
1337 "Can only set default encoding to %s",
1338 unicode_default_encoding);
1339 return -1;
1340 }
Fred Drakee4315f52000-05-09 19:53:39 +00001341 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001342}
1343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344/* error handling callback helper:
1345 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001346 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001347 and adjust various state variables.
1348 return 0 on success, -1 on error
1349*/
1350
1351static
1352int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1353 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001354 const char **input, const char **inend, Py_ssize_t *startinpos,
1355 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001358 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359
1360 PyObject *restuple = NULL;
1361 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001362 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001363 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001364 Py_ssize_t requiredsize;
1365 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001366 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001367 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001368 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001369 int res = -1;
1370
1371 if (*errorHandler == NULL) {
1372 *errorHandler = PyCodec_LookupError(errors);
1373 if (*errorHandler == NULL)
1374 goto onError;
1375 }
1376
1377 if (*exceptionObject == NULL) {
1378 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001379 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001380 if (*exceptionObject == NULL)
1381 goto onError;
1382 }
1383 else {
1384 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1385 goto onError;
1386 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1387 goto onError;
1388 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1389 goto onError;
1390 }
1391
1392 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1393 if (restuple == NULL)
1394 goto onError;
1395 if (!PyTuple_Check(restuple)) {
1396 PyErr_Format(PyExc_TypeError, &argparse[4]);
1397 goto onError;
1398 }
1399 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1400 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001401
1402 /* Copy back the bytes variables, which might have been modified by the
1403 callback */
1404 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1405 if (!inputobj)
1406 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001407 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001408 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1409 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001410 *input = PyString_AS_STRING(inputobj);
1411 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001412 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001413 /* we can DECREF safely, as the exception has another reference,
1414 so the object won't go away. */
1415 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001417 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001418 newpos = insize+newpos;
1419 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001420 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001421 goto onError;
1422 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001423
1424 /* need more space? (at least enough for what we
1425 have+the replacement+the rest of the string (starting
1426 at the new input position), so we won't have to check space
1427 when there are no errors in the rest of the string) */
1428 repptr = PyUnicode_AS_UNICODE(repunicode);
1429 repsize = PyUnicode_GET_SIZE(repunicode);
1430 requiredsize = *outpos + repsize + insize-newpos;
1431 if (requiredsize > outsize) {
1432 if (requiredsize<2*outsize)
1433 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001434 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 goto onError;
1436 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1437 }
1438 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001439 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 Py_UNICODE_COPY(*outptr, repptr, repsize);
1441 *outptr += repsize;
1442 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001444 /* we made it! */
1445 res = 0;
1446
1447 onError:
1448 Py_XDECREF(restuple);
1449 return res;
1450}
1451
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001452/* --- UTF-7 Codec -------------------------------------------------------- */
1453
1454/* see RFC2152 for details */
1455
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character code.  Indicates whether a character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1474
/* SPECIAL(c, encodeO, encodeWS) is true when character `c` may not be
   written directly in UTF-7 output:
     - anything outside 1..127,
     - characters classified 1 in utf7_special,
     - whitespace (class 2) when encodeWS is set,
     - RFC2152 Set O characters (class 3) when encodeO is set.

   Note: the test is spelled (c) <= 0 rather than (c) < 0 to work around
   gcc warnings about a comparison that is always false; utf7_special[0]
   is 1 anyway, so treating 0 as special here changes nothing. */

#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     (encodeWS && (utf7_special[(c)] == 2)) ||                  \
     (encodeO && (utf7_special[(c)] == 3)))
1484
/* B64(n): base64 alphabet character for the low six bits of `n`. */
#define B64(n)                                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"    \
     [(n) & 0x3f])

/* B64CHAR(c): true when `c` is alphanumeric, '+' or '/'. */
#define B64CHAR(c)                                      \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of the base64 character `c`.  The argument is
   not validated; pass only characters accepted by B64CHAR(). */
#define UB64(c)                                         \
    ((c) == '+' ? 62 :                                  \
     (c) == '/' ? 63 :                                  \
     (c) >= 'a' ? (c) - 71 :                            \
     (c) >= 'A' ? (c) - 65 :                            \
     (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492
/* ENCODE(out, ch, bits): while at least six bits are pending in the bit
   buffer `ch`, emit the topmost six of the `bits` pending bits as one
   base64 character through `out` (which is advanced) and lower `bits`
   by six.  Expands to a bare while loop that modifies `out` and `bits`. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }
1498
/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in the bit buffer `ch`, take the topmost sixteen of the `bits`
   pending bits as one code unit and lower `bits` by sixteen.  Then:
     - if `surrogate` is set, the unit is dropped and the flag cleared;
     - if the unit lies in 0xDC00..0xDFFF, `surrogate` is set, `errmsg` is
       assigned and control jumps to the `utf7Error` label;
     - otherwise the unit is stored through `out`, which is advanced.

   Expands to a bare while loop; the expanding function must provide an
   `errmsg` variable and a `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while (bits >= 16) {                                                    \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);       \
        bits -= 16;                                                         \
        if (surrogate) {                                                    \
            /* We have already generated an error for the high surrogate    \
               so let's not bother seeing if the low surrogate is correct   \
               or not */                                                    \
            surrogate = 0;                                                  \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            /* This is a surrogate pair. Unfortunately we can't represent   \
               it in a 16-bit character */                                  \
            surrogate = 1;                                                  \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *out++ = outCh;                                                 \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001517
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001518PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001519 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001520 const char *errors)
1521{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001522 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t startinpos;
1524 Py_ssize_t endinpos;
1525 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 const char *e;
1527 PyUnicodeObject *unicode;
1528 Py_UNICODE *p;
1529 const char *errmsg = "";
1530 int inShift = 0;
1531 unsigned int bitsleft = 0;
1532 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001533 int surrogate = 0;
1534 PyObject *errorHandler = NULL;
1535 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536
1537 unicode = _PyUnicode_New(size);
1538 if (!unicode)
1539 return NULL;
1540 if (size == 0)
1541 return (PyObject *)unicode;
1542
1543 p = unicode->str;
1544 e = s + size;
1545
1546 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_UNICODE ch;
1548 restart:
1549 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550
1551 if (inShift) {
1552 if ((ch == '-') || !B64CHAR(ch)) {
1553 inShift = 0;
1554 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001555
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001556 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1557 if (bitsleft >= 6) {
1558 /* The shift sequence has a partial character in it. If
1559 bitsleft < 6 then we could just classify it as padding
1560 but that is not the case here */
1561
1562 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001563 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001564 }
1565 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001566 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567 here so indicate the potential of a misencoded character. */
1568
1569 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1570 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1571 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001572 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001573 }
1574
1575 if (ch == '-') {
1576 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001577 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 inShift = 1;
1579 }
1580 } else if (SPECIAL(ch,0,0)) {
1581 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001582 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 } else {
1584 *p++ = ch;
1585 }
1586 } else {
1587 charsleft = (charsleft << 6) | UB64(ch);
1588 bitsleft += 6;
1589 s++;
1590 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1591 }
1592 }
1593 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001594 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 s++;
1596 if (s < e && *s == '-') {
1597 s++;
1598 *p++ = '+';
1599 } else
1600 {
1601 inShift = 1;
1602 bitsleft = 0;
1603 }
1604 }
1605 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001606 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607 errmsg = "unexpected special character";
1608 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001609 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001610 }
1611 else {
1612 *p++ = ch;
1613 s++;
1614 }
1615 continue;
1616 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 outpos = p-PyUnicode_AS_UNICODE(unicode);
1618 endinpos = s-starts;
1619 if (unicode_decode_call_errorhandler(
1620 errors, &errorHandler,
1621 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001622 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001623 (PyObject **)&unicode, &outpos, &p))
1624 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001625 }
1626
1627 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 outpos = p-PyUnicode_AS_UNICODE(unicode);
1629 endinpos = size;
1630 if (unicode_decode_call_errorhandler(
1631 errors, &errorHandler,
1632 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001633 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001636 if (s < e)
1637 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001638 }
1639
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001640 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 goto onError;
1642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001643 Py_XDECREF(errorHandler);
1644 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001645 return (PyObject *)unicode;
1646
1647onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 Py_XDECREF(errorHandler);
1649 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 Py_DECREF(unicode);
1651 return NULL;
1652}
1653
1654
1655PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001656 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 int encodeSetO,
1658 int encodeWhiteSpace,
1659 const char *errors)
1660{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001661 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001663 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001665 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 unsigned int bitsleft = 0;
1667 unsigned long charsleft = 0;
1668 char * out;
1669 char * start;
1670
1671 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001672 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673
Walter Dörwald51ab4142007-05-05 14:43:36 +00001674 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 if (v == NULL)
1676 return NULL;
1677
Walter Dörwald51ab4142007-05-05 14:43:36 +00001678 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 for (;i < size; ++i) {
1680 Py_UNICODE ch = s[i];
1681
1682 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001683 if (ch == '+') {
1684 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 *out++ = '-';
1686 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1687 charsleft = ch;
1688 bitsleft = 16;
1689 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001690 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001692 } else {
1693 *out++ = (char) ch;
1694 }
1695 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001696 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1697 *out++ = B64(charsleft << (6-bitsleft));
1698 charsleft = 0;
1699 bitsleft = 0;
1700 /* Characters not in the BASE64 set implicitly unshift the sequence
1701 so no '-' is required, except if the character is itself a '-' */
1702 if (B64CHAR(ch) || ch == '-') {
1703 *out++ = '-';
1704 }
1705 inShift = 0;
1706 *out++ = (char) ch;
1707 } else {
1708 bitsleft += 16;
1709 charsleft = (charsleft << 16) | ch;
1710 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1711
1712 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001713 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 or '-' then the shift sequence will be terminated implicitly and we
1715 don't have to insert a '-'. */
1716
1717 if (bitsleft == 0) {
1718 if (i + 1 < size) {
1719 Py_UNICODE ch2 = s[i+1];
1720
1721 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001722
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 } else if (B64CHAR(ch2) || ch2 == '-') {
1724 *out++ = '-';
1725 inShift = 0;
1726 } else {
1727 inShift = 0;
1728 }
1729
1730 }
1731 else {
1732 *out++ = '-';
1733 inShift = 0;
1734 }
1735 }
Tim Petersced69f82003-09-16 20:30:58 +00001736 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001737 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001738 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001739 if (bitsleft) {
1740 *out++= B64(charsleft << (6-bitsleft) );
1741 *out++ = '-';
1742 }
1743
Guido van Rossum98297ee2007-11-06 21:34:58 +00001744 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1745 Py_DECREF(v);
1746 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001747}
1748
1749#undef SPECIAL
1750#undef B64
1751#undef B64CHAR
1752#undef UB64
1753#undef ENCODE
1754#undef DECODE
1755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756/* --- UTF-8 Codec -------------------------------------------------------- */
1757
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence it
   introduces.  Zero means the byte is an illegal prefix.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1779
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001781 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *errors)
1783{
Walter Dörwald69652032004-09-07 20:24:22 +00001784 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1785}
1786
1787PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001789 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001790 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001794 Py_ssize_t startinpos;
1795 Py_ssize_t endinpos;
1796 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 const char *e;
1798 PyUnicodeObject *unicode;
1799 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001800 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 PyObject *errorHandler = NULL;
1802 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
1804 /* Note: size will always be longer than the resulting Unicode
1805 character count */
1806 unicode = _PyUnicode_New(size);
1807 if (!unicode)
1808 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001809 if (size == 0) {
1810 if (consumed)
1811 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814
1815 /* Unpack UTF-8 encoded data */
1816 p = unicode->str;
1817 e = s + size;
1818
1819 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001820 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821
1822 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001823 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824 s++;
1825 continue;
1826 }
1827
1828 n = utf8_code_length[ch];
1829
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001830 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001831 if (consumed)
1832 break;
1833 else {
1834 errmsg = "unexpected end of data";
1835 startinpos = s-starts;
1836 endinpos = size;
1837 goto utf8Error;
1838 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840
1841 switch (n) {
1842
1843 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 startinpos = s-starts;
1846 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001847 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848
1849 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001851 startinpos = s-starts;
1852 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
1855 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 if ((s[1] & 0xc0) != 0x80) {
1857 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 startinpos = s-starts;
1859 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 goto utf8Error;
1861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001863 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 startinpos = s-starts;
1865 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001866 errmsg = "illegal encoding";
1867 goto utf8Error;
1868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001870 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 break;
1872
1873 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001874 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001875 (s[2] & 0xc0) != 0x80) {
1876 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 startinpos = s-starts;
1878 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001879 goto utf8Error;
1880 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001881 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001882 if (ch < 0x0800) {
1883 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001884 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001885
1886 XXX For wide builds (UCS-4) we should probably try
1887 to recombine the surrogates into a single code
1888 unit.
1889 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001890 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 startinpos = s-starts;
1892 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 goto utf8Error;
1894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001896 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001897 break;
1898
1899 case 4:
1900 if ((s[1] & 0xc0) != 0x80 ||
1901 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001902 (s[3] & 0xc0) != 0x80) {
1903 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 startinpos = s-starts;
1905 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001906 goto utf8Error;
1907 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001908 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1909 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1910 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001911 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001912 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001913 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001914 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001915 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001917 startinpos = s-starts;
1918 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001919 goto utf8Error;
1920 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001921#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001922 *p++ = (Py_UNICODE)ch;
1923#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001924 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001925
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 /* translate from 10000..10FFFF to 0..FFFF */
1927 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001928
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001929 /* high surrogate = top 10 bits added to D800 */
1930 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001931
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001932 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001933 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001934#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 break;
1936
1937 default:
1938 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001939 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 startinpos = s-starts;
1941 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001942 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 }
1944 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001945 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001946
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001947 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 outpos = p-PyUnicode_AS_UNICODE(unicode);
1949 if (unicode_decode_call_errorhandler(
1950 errors, &errorHandler,
1951 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001952 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001953 (PyObject **)&unicode, &outpos, &p))
1954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 }
Walter Dörwald69652032004-09-07 20:24:22 +00001956 if (consumed)
1957 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958
1959 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001960 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961 goto onError;
1962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001963 Py_XDECREF(errorHandler);
1964 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 return (PyObject *)unicode;
1966
1967onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 Py_XDECREF(errorHandler);
1969 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 Py_DECREF(unicode);
1971 return NULL;
1972}
1973
Tim Peters602f7402002-04-27 18:03:26 +00001974/* Allocation strategy: if the string is short, convert into a stack buffer
1975 and allocate exactly as much space needed at the end. Else allocate the
1976 maximum possible needed (4 result bytes per Unicode character), and return
1977 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001978*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001979PyObject *
1980PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001981 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001982 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983{
Tim Peters602f7402002-04-27 18:03:26 +00001984#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001985
Guido van Rossum98297ee2007-11-06 21:34:58 +00001986 Py_ssize_t i; /* index into s of next input byte */
1987 PyObject *result; /* result string object */
1988 char *p; /* next free byte in output buffer */
1989 Py_ssize_t nallocated; /* number of result bytes allocated */
1990 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001991 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001992
Tim Peters602f7402002-04-27 18:03:26 +00001993 assert(s != NULL);
1994 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995
Tim Peters602f7402002-04-27 18:03:26 +00001996 if (size <= MAX_SHORT_UNICHARS) {
1997 /* Write into the stack buffer; nallocated can't overflow.
1998 * At the end, we'll allocate exactly as much heap space as it
1999 * turns out we need.
2000 */
2001 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002002 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002003 p = stackbuf;
2004 }
2005 else {
2006 /* Overallocate on the heap, and give the excess back at the end. */
2007 nallocated = size * 4;
2008 if (nallocated / 4 != size) /* overflow! */
2009 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002010 result = PyString_FromStringAndSize(NULL, nallocated);
2011 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002012 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002013 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002014 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002015
Tim Peters602f7402002-04-27 18:03:26 +00002016 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002018
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002019 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002020 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002022
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002024 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002025 *p++ = (char)(0xc0 | (ch >> 6));
2026 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002027 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002028 else {
Tim Peters602f7402002-04-27 18:03:26 +00002029 /* Encode UCS2 Unicode ordinals */
2030 if (ch < 0x10000) {
2031 /* Special case: check for high surrogate */
2032 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2033 Py_UCS4 ch2 = s[i];
2034 /* Check for low surrogate and combine the two to
2035 form a UCS4 value */
2036 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002037 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002038 i++;
2039 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002040 }
Tim Peters602f7402002-04-27 18:03:26 +00002041 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002042 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002043 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002044 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2045 *p++ = (char)(0x80 | (ch & 0x3f));
2046 continue;
2047 }
2048encodeUCS4:
2049 /* Encode UCS4 Unicode ordinals */
2050 *p++ = (char)(0xf0 | (ch >> 18));
2051 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2052 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2053 *p++ = (char)(0x80 | (ch & 0x3f));
2054 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002056
Guido van Rossum98297ee2007-11-06 21:34:58 +00002057 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002058 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002059 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002060 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002061 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002062 }
2063 else {
2064 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002065 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002066 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002067 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002068 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002069 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002070
Tim Peters602f7402002-04-27 18:03:26 +00002071#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072}
2073
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2075{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 if (!PyUnicode_Check(unicode)) {
2077 PyErr_BadArgument();
2078 return NULL;
2079 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002080 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2081 PyUnicode_GET_SIZE(unicode),
2082 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083}
2084
Walter Dörwald41980ca2007-08-16 21:55:45 +00002085/* --- UTF-32 Codec ------------------------------------------------------- */
2086
2087PyObject *
2088PyUnicode_DecodeUTF32(const char *s,
2089 Py_ssize_t size,
2090 const char *errors,
2091 int *byteorder)
2092{
2093 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2094}
2095
2096PyObject *
2097PyUnicode_DecodeUTF32Stateful(const char *s,
2098 Py_ssize_t size,
2099 const char *errors,
2100 int *byteorder,
2101 Py_ssize_t *consumed)
2102{
2103 const char *starts = s;
2104 Py_ssize_t startinpos;
2105 Py_ssize_t endinpos;
2106 Py_ssize_t outpos;
2107 PyUnicodeObject *unicode;
2108 Py_UNICODE *p;
2109#ifndef Py_UNICODE_WIDE
2110 int i, pairs;
2111#else
2112 const int pairs = 0;
2113#endif
2114 const unsigned char *q, *e;
2115 int bo = 0; /* assume native ordering by default */
2116 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002117 /* Offsets from q for retrieving bytes in the right order. */
2118#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2119 int iorder[] = {0, 1, 2, 3};
2120#else
2121 int iorder[] = {3, 2, 1, 0};
2122#endif
2123 PyObject *errorHandler = NULL;
2124 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002125 /* On narrow builds we split characters outside the BMP into two
2126 codepoints => count how much extra space we need. */
2127#ifndef Py_UNICODE_WIDE
2128 for (i = pairs = 0; i < size/4; i++)
2129 if (((Py_UCS4 *)s)[i] >= 0x10000)
2130 pairs++;
2131#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002132
2133 /* This might be one to much, because of a BOM */
2134 unicode = _PyUnicode_New((size+3)/4+pairs);
2135 if (!unicode)
2136 return NULL;
2137 if (size == 0)
2138 return (PyObject *)unicode;
2139
2140 /* Unpack UTF-32 encoded data */
2141 p = unicode->str;
2142 q = (unsigned char *)s;
2143 e = q + size;
2144
2145 if (byteorder)
2146 bo = *byteorder;
2147
2148 /* Check for BOM marks (U+FEFF) in the input and adjust current
2149 byte order setting accordingly. In native mode, the leading BOM
2150 mark is skipped, in all other modes, it is copied to the output
2151 stream as-is (giving a ZWNBSP character). */
2152 if (bo == 0) {
2153 if (size >= 4) {
2154 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2155 (q[iorder[1]] << 8) | q[iorder[0]];
2156#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2157 if (bom == 0x0000FEFF) {
2158 q += 4;
2159 bo = -1;
2160 }
2161 else if (bom == 0xFFFE0000) {
2162 q += 4;
2163 bo = 1;
2164 }
2165#else
2166 if (bom == 0x0000FEFF) {
2167 q += 4;
2168 bo = 1;
2169 }
2170 else if (bom == 0xFFFE0000) {
2171 q += 4;
2172 bo = -1;
2173 }
2174#endif
2175 }
2176 }
2177
2178 if (bo == -1) {
2179 /* force LE */
2180 iorder[0] = 0;
2181 iorder[1] = 1;
2182 iorder[2] = 2;
2183 iorder[3] = 3;
2184 }
2185 else if (bo == 1) {
2186 /* force BE */
2187 iorder[0] = 3;
2188 iorder[1] = 2;
2189 iorder[2] = 1;
2190 iorder[3] = 0;
2191 }
2192
2193 while (q < e) {
2194 Py_UCS4 ch;
2195 /* remaining bytes at the end? (size should be divisible by 4) */
2196 if (e-q<4) {
2197 if (consumed)
2198 break;
2199 errmsg = "truncated data";
2200 startinpos = ((const char *)q)-starts;
2201 endinpos = ((const char *)e)-starts;
2202 goto utf32Error;
2203 /* The remaining input chars are ignored if the callback
2204 chooses to skip the input */
2205 }
2206 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2207 (q[iorder[1]] << 8) | q[iorder[0]];
2208
2209 if (ch >= 0x110000)
2210 {
2211 errmsg = "codepoint not in range(0x110000)";
2212 startinpos = ((const char *)q)-starts;
2213 endinpos = startinpos+4;
2214 goto utf32Error;
2215 }
2216#ifndef Py_UNICODE_WIDE
2217 if (ch >= 0x10000)
2218 {
2219 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2220 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2221 }
2222 else
2223#endif
2224 *p++ = ch;
2225 q += 4;
2226 continue;
2227 utf32Error:
2228 outpos = p-PyUnicode_AS_UNICODE(unicode);
2229 if (unicode_decode_call_errorhandler(
2230 errors, &errorHandler,
2231 "utf32", errmsg,
2232 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2233 (PyObject **)&unicode, &outpos, &p))
2234 goto onError;
2235 }
2236
2237 if (byteorder)
2238 *byteorder = bo;
2239
2240 if (consumed)
2241 *consumed = (const char *)q-starts;
2242
2243 /* Adjust length */
2244 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2245 goto onError;
2246
2247 Py_XDECREF(errorHandler);
2248 Py_XDECREF(exc);
2249 return (PyObject *)unicode;
2250
2251onError:
2252 Py_DECREF(unicode);
2253 Py_XDECREF(errorHandler);
2254 Py_XDECREF(exc);
2255 return NULL;
2256}
2257
2258PyObject *
2259PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2260 Py_ssize_t size,
2261 const char *errors,
2262 int byteorder)
2263{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002264 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002265 unsigned char *p;
2266#ifndef Py_UNICODE_WIDE
2267 int i, pairs;
2268#else
2269 const int pairs = 0;
2270#endif
2271 /* Offsets from p for storing byte pairs in the right order. */
2272#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2273 int iorder[] = {0, 1, 2, 3};
2274#else
2275 int iorder[] = {3, 2, 1, 0};
2276#endif
2277
2278#define STORECHAR(CH) \
2279 do { \
2280 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2281 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2282 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2283 p[iorder[0]] = (CH) & 0xff; \
2284 p += 4; \
2285 } while(0)
2286
2287 /* In narrow builds we can output surrogate pairs as one codepoint,
2288 so we need less space. */
2289#ifndef Py_UNICODE_WIDE
2290 for (i = pairs = 0; i < size-1; i++)
2291 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2292 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2293 pairs++;
2294#endif
2295 v = PyBytes_FromStringAndSize(NULL,
2296 4 * (size - pairs + (byteorder == 0)));
2297 if (v == NULL)
2298 return NULL;
2299
2300 p = (unsigned char *)PyBytes_AS_STRING(v);
2301 if (byteorder == 0)
2302 STORECHAR(0xFEFF);
2303 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002304 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002305
2306 if (byteorder == -1) {
2307 /* force LE */
2308 iorder[0] = 0;
2309 iorder[1] = 1;
2310 iorder[2] = 2;
2311 iorder[3] = 3;
2312 }
2313 else if (byteorder == 1) {
2314 /* force BE */
2315 iorder[0] = 3;
2316 iorder[1] = 2;
2317 iorder[2] = 1;
2318 iorder[3] = 0;
2319 }
2320
2321 while (size-- > 0) {
2322 Py_UCS4 ch = *s++;
2323#ifndef Py_UNICODE_WIDE
2324 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2325 Py_UCS4 ch2 = *s;
2326 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2327 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2328 s++;
2329 size--;
2330 }
2331 }
2332#endif
2333 STORECHAR(ch);
2334 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002335
2336 done:
2337 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
2338 Py_DECREF(v);
2339 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002340#undef STORECHAR
2341}
2342
2343PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2344{
2345 if (!PyUnicode_Check(unicode)) {
2346 PyErr_BadArgument();
2347 return NULL;
2348 }
2349 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2350 PyUnicode_GET_SIZE(unicode),
2351 NULL,
2352 0);
2353}
2354
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355/* --- UTF-16 Codec ------------------------------------------------------- */
2356
Tim Peters772747b2001-08-09 22:21:55 +00002357PyObject *
2358PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002359 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002360 const char *errors,
2361 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362{
Walter Dörwald69652032004-09-07 20:24:22 +00002363 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2364}
2365
2366PyObject *
2367PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002368 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002369 const char *errors,
2370 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002371 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002372{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002374 Py_ssize_t startinpos;
2375 Py_ssize_t endinpos;
2376 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 PyUnicodeObject *unicode;
2378 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002379 const unsigned char *q, *e;
2380 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002381 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002382 /* Offsets from q for retrieving byte pairs in the right order. */
2383#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2384 int ihi = 1, ilo = 0;
2385#else
2386 int ihi = 0, ilo = 1;
2387#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002388 PyObject *errorHandler = NULL;
2389 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390
2391 /* Note: size will always be longer than the resulting Unicode
2392 character count */
2393 unicode = _PyUnicode_New(size);
2394 if (!unicode)
2395 return NULL;
2396 if (size == 0)
2397 return (PyObject *)unicode;
2398
2399 /* Unpack UTF-16 encoded data */
2400 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002401 q = (unsigned char *)s;
2402 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403
2404 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002405 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002407 /* Check for BOM marks (U+FEFF) in the input and adjust current
2408 byte order setting accordingly. In native mode, the leading BOM
2409 mark is skipped, in all other modes, it is copied to the output
2410 stream as-is (giving a ZWNBSP character). */
2411 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002412 if (size >= 2) {
2413 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002414#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002415 if (bom == 0xFEFF) {
2416 q += 2;
2417 bo = -1;
2418 }
2419 else if (bom == 0xFFFE) {
2420 q += 2;
2421 bo = 1;
2422 }
Tim Petersced69f82003-09-16 20:30:58 +00002423#else
Walter Dörwald69652032004-09-07 20:24:22 +00002424 if (bom == 0xFEFF) {
2425 q += 2;
2426 bo = 1;
2427 }
2428 else if (bom == 0xFFFE) {
2429 q += 2;
2430 bo = -1;
2431 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002432#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002433 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435
Tim Peters772747b2001-08-09 22:21:55 +00002436 if (bo == -1) {
2437 /* force LE */
2438 ihi = 1;
2439 ilo = 0;
2440 }
2441 else if (bo == 1) {
2442 /* force BE */
2443 ihi = 0;
2444 ilo = 1;
2445 }
2446
2447 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002448 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002449 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002450 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002451 if (consumed)
2452 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002453 errmsg = "truncated data";
2454 startinpos = ((const char *)q)-starts;
2455 endinpos = ((const char *)e)-starts;
2456 goto utf16Error;
2457 /* The remaining input chars are ignored if the callback
2458 chooses to skip the input */
2459 }
2460 ch = (q[ihi] << 8) | q[ilo];
2461
Tim Peters772747b2001-08-09 22:21:55 +00002462 q += 2;
2463
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 if (ch < 0xD800 || ch > 0xDFFF) {
2465 *p++ = ch;
2466 continue;
2467 }
2468
2469 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002470 if (q >= e) {
2471 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002472 startinpos = (((const char *)q)-2)-starts;
2473 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002474 goto utf16Error;
2475 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002476 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002477 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2478 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002479 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002480#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002481 *p++ = ch;
2482 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002483#else
2484 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002485#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002486 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002487 }
2488 else {
2489 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 startinpos = (((const char *)q)-4)-starts;
2491 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002492 goto utf16Error;
2493 }
2494
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002496 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002497 startinpos = (((const char *)q)-2)-starts;
2498 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002499 /* Fall through to report the error */
2500
2501 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 outpos = p-PyUnicode_AS_UNICODE(unicode);
2503 if (unicode_decode_call_errorhandler(
2504 errors, &errorHandler,
2505 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002506 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002508 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 }
2510
2511 if (byteorder)
2512 *byteorder = bo;
2513
Walter Dörwald69652032004-09-07 20:24:22 +00002514 if (consumed)
2515 *consumed = (const char *)q-starts;
2516
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002518 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 goto onError;
2520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 return (PyObject *)unicode;
2524
2525onError:
2526 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002527 Py_XDECREF(errorHandler);
2528 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 return NULL;
2530}
2531
Tim Peters772747b2001-08-09 22:21:55 +00002532PyObject *
2533PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002534 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002535 const char *errors,
2536 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002538 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002539 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002540#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002541 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002542#else
2543 const int pairs = 0;
2544#endif
Tim Peters772747b2001-08-09 22:21:55 +00002545 /* Offsets from p for storing byte pairs in the right order. */
2546#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2547 int ihi = 1, ilo = 0;
2548#else
2549 int ihi = 0, ilo = 1;
2550#endif
2551
2552#define STORECHAR(CH) \
2553 do { \
2554 p[ihi] = ((CH) >> 8) & 0xff; \
2555 p[ilo] = (CH) & 0xff; \
2556 p += 2; \
2557 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002559#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002560 for (i = pairs = 0; i < size; i++)
2561 if (s[i] >= 0x10000)
2562 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002563#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002564 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002565 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 if (v == NULL)
2567 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568
Walter Dörwald3cc34522007-05-04 10:48:27 +00002569 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002571 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002572 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002573 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002574
2575 if (byteorder == -1) {
2576 /* force LE */
2577 ihi = 1;
2578 ilo = 0;
2579 }
2580 else if (byteorder == 1) {
2581 /* force BE */
2582 ihi = 0;
2583 ilo = 1;
2584 }
2585
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002586 while (size-- > 0) {
2587 Py_UNICODE ch = *s++;
2588 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002589#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002590 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002591 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2592 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002594#endif
Tim Peters772747b2001-08-09 22:21:55 +00002595 STORECHAR(ch);
2596 if (ch2)
2597 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002598 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002599
2600 done:
2601 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
2602 Py_DECREF(v);
2603 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002604#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605}
2606
2607PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2608{
2609 if (!PyUnicode_Check(unicode)) {
2610 PyErr_BadArgument();
2611 return NULL;
2612 }
2613 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2614 PyUnicode_GET_SIZE(unicode),
2615 NULL,
2616 0);
2617}
2618
2619/* --- Unicode Escape Codec ----------------------------------------------- */
2620
Fredrik Lundh06d12682001-01-24 07:59:11 +00002621static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002622
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002624 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 const char *errors)
2626{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002628 Py_ssize_t startinpos;
2629 Py_ssize_t endinpos;
2630 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002635 char* message;
2636 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 PyObject *errorHandler = NULL;
2638 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002639
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 /* Escaped strings will always be longer than the resulting
2641 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 length after conversion to the true value.
2643 (but if the error callback returns a long replacement string
2644 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 v = _PyUnicode_New(size);
2646 if (v == NULL)
2647 goto onError;
2648 if (size == 0)
2649 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002653
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 while (s < end) {
2655 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002656 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658
2659 /* Non-escape characters are interpreted as Unicode ordinals */
2660 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002661 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 continue;
2663 }
2664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002665 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 /* \ - Escapes */
2667 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002668 c = *s++;
2669 if (s > end)
2670 c = '\0'; /* Invalid after \ */
2671 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672
2673 /* \x escapes */
2674 case '\n': break;
2675 case '\\': *p++ = '\\'; break;
2676 case '\'': *p++ = '\''; break;
2677 case '\"': *p++ = '\"'; break;
2678 case 'b': *p++ = '\b'; break;
2679 case 'f': *p++ = '\014'; break; /* FF */
2680 case 't': *p++ = '\t'; break;
2681 case 'n': *p++ = '\n'; break;
2682 case 'r': *p++ = '\r'; break;
2683 case 'v': *p++ = '\013'; break; /* VT */
2684 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2685
2686 /* \OOO (octal) escapes */
2687 case '0': case '1': case '2': case '3':
2688 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002689 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002690 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002691 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002692 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002693 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002695 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 break;
2697
Fredrik Lundhccc74732001-02-18 22:13:49 +00002698 /* hex escapes */
2699 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002701 digits = 2;
2702 message = "truncated \\xXX escape";
2703 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704
Fredrik Lundhccc74732001-02-18 22:13:49 +00002705 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002707 digits = 4;
2708 message = "truncated \\uXXXX escape";
2709 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002712 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002713 digits = 8;
2714 message = "truncated \\UXXXXXXXX escape";
2715 hexescape:
2716 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 outpos = p-PyUnicode_AS_UNICODE(v);
2718 if (s+digits>end) {
2719 endinpos = size;
2720 if (unicode_decode_call_errorhandler(
2721 errors, &errorHandler,
2722 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002723 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 (PyObject **)&v, &outpos, &p))
2725 goto onError;
2726 goto nextByte;
2727 }
2728 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002730 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 endinpos = (s+i+1)-starts;
2732 if (unicode_decode_call_errorhandler(
2733 errors, &errorHandler,
2734 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002735 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002737 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002739 }
2740 chr = (chr<<4) & ~0xF;
2741 if (c >= '0' && c <= '9')
2742 chr += c - '0';
2743 else if (c >= 'a' && c <= 'f')
2744 chr += 10 + c - 'a';
2745 else
2746 chr += 10 + c - 'A';
2747 }
2748 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002749 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 /* _decoding_error will have already written into the
2751 target buffer. */
2752 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002754 /* when we get here, chr is a 32-bit unicode character */
2755 if (chr <= 0xffff)
2756 /* UCS-2 character */
2757 *p++ = (Py_UNICODE) chr;
2758 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002759 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002760 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002761#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002762 *p++ = chr;
2763#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002764 chr -= 0x10000L;
2765 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002766 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002767#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002768 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 endinpos = s-starts;
2770 outpos = p-PyUnicode_AS_UNICODE(v);
2771 if (unicode_decode_call_errorhandler(
2772 errors, &errorHandler,
2773 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002774 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002776 goto onError;
2777 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002778 break;
2779
2780 /* \N{name} */
2781 case 'N':
2782 message = "malformed \\N character escape";
2783 if (ucnhash_CAPI == NULL) {
2784 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002785 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002786 m = PyImport_ImportModule("unicodedata");
2787 if (m == NULL)
2788 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002789 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002790 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002791 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002792 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002793 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002794 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 if (ucnhash_CAPI == NULL)
2796 goto ucnhashError;
2797 }
2798 if (*s == '{') {
2799 const char *start = s+1;
2800 /* look for the closing brace */
2801 while (*s != '}' && s < end)
2802 s++;
2803 if (s > start && s < end && *s == '}') {
2804 /* found a name. look it up in the unicode database */
2805 message = "unknown Unicode character name";
2806 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002807 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002808 goto store;
2809 }
2810 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 endinpos = s-starts;
2812 outpos = p-PyUnicode_AS_UNICODE(v);
2813 if (unicode_decode_call_errorhandler(
2814 errors, &errorHandler,
2815 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002816 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002819 break;
2820
2821 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002822 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 message = "\\ at end of string";
2824 s--;
2825 endinpos = s-starts;
2826 outpos = p-PyUnicode_AS_UNICODE(v);
2827 if (unicode_decode_call_errorhandler(
2828 errors, &errorHandler,
2829 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002830 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002832 goto onError;
2833 }
2834 else {
2835 *p++ = '\\';
2836 *p++ = (unsigned char)s[-1];
2837 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002838 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840 nextByte:
2841 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002843 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002845 Py_XDECREF(errorHandler);
2846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002848
Fredrik Lundhccc74732001-02-18 22:13:49 +00002849ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002850 PyErr_SetString(
2851 PyExc_UnicodeError,
2852 "\\N escapes not supported (can't load unicodedata module)"
2853 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002854 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 Py_XDECREF(errorHandler);
2856 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002857 return NULL;
2858
Fredrik Lundhccc74732001-02-18 22:13:49 +00002859onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 Py_XDECREF(errorHandler);
2862 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 return NULL;
2864}
2865
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE(review): no function near this comment takes a `quotes`
   argument, so this description looks stale -- confirm and update.

*/
2872
Thomas Wouters477c8d52006-05-27 19:21:47 +00002873Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2874 Py_ssize_t size,
2875 Py_UNICODE ch)
2876{
2877 /* like wcschr, but doesn't stop at NULL characters */
2878
2879 while (size-- > 0) {
2880 if (*s == ch)
2881 return s;
2882 s++;
2883 }
2884
2885 return NULL;
2886}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002887
Walter Dörwald79e913e2007-05-12 11:08:06 +00002888static const char *hexdigits = "0123456789abcdef";
2889
2890PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2891 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002893 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895
Thomas Wouters89f507f2006-12-13 04:49:30 +00002896 /* XXX(nnorwitz): rather than over-allocating, it would be
2897 better to choose a different scheme. Perhaps scan the
2898 first N-chars of the string and allocate based on that size.
2899 */
2900 /* Initial allocation is based on the longest-possible unichr
2901 escape.
2902
2903 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2904 unichr, so in this case it's the longest unichr escape. In
2905 narrow (UTF-16) builds this is five chars per source unichr
2906 since there are two unichrs in the surrogate pair, so in narrow
2907 (UTF-16) builds it's not the longest unichr escape.
2908
2909 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2910 so in the narrow (UTF-16) build case it's the longest unichr
2911 escape.
2912 */
2913
Walter Dörwald79e913e2007-05-12 11:08:06 +00002914 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002915#ifdef Py_UNICODE_WIDE
2916 + 10*size
2917#else
2918 + 6*size
2919#endif
2920 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 if (repr == NULL)
2922 return NULL;
2923
Walter Dörwald79e913e2007-05-12 11:08:06 +00002924 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 while (size-- > 0) {
2927 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002928
Walter Dörwald79e913e2007-05-12 11:08:06 +00002929 /* Escape backslashes */
2930 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931 *p++ = '\\';
2932 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002933 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002934 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002935
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002936#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002937 /* Map 21-bit characters to '\U00xxxxxx' */
2938 else if (ch >= 0x10000) {
2939 *p++ = '\\';
2940 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002941 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2942 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2943 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2944 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2945 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2946 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2947 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2948 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002949 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002950 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002951#else
2952 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002953 else if (ch >= 0xD800 && ch < 0xDC00) {
2954 Py_UNICODE ch2;
2955 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002956
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002957 ch2 = *s++;
2958 size--;
2959 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2960 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2961 *p++ = '\\';
2962 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002963 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2964 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2965 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2966 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2967 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2968 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2969 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2970 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002971 continue;
2972 }
2973 /* Fall through: isolated surrogates are copied as-is */
2974 s--;
2975 size++;
2976 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002977#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002978
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002980 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 *p++ = '\\';
2982 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002983 *p++ = hexdigits[(ch >> 12) & 0x000F];
2984 *p++ = hexdigits[(ch >> 8) & 0x000F];
2985 *p++ = hexdigits[(ch >> 4) & 0x000F];
2986 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002988
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002989 /* Map special whitespace to '\t', \n', '\r' */
2990 else if (ch == '\t') {
2991 *p++ = '\\';
2992 *p++ = 't';
2993 }
2994 else if (ch == '\n') {
2995 *p++ = '\\';
2996 *p++ = 'n';
2997 }
2998 else if (ch == '\r') {
2999 *p++ = '\\';
3000 *p++ = 'r';
3001 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003002
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003003 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003004 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003006 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003007 *p++ = hexdigits[(ch >> 4) & 0x000F];
3008 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003010
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 /* Copy everything else as-is */
3012 else
3013 *p++ = (char) ch;
3014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015
Guido van Rossum98297ee2007-11-06 21:34:58 +00003016 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3017 p - PyBytes_AS_STRING(repr));
3018 Py_DECREF(repr);
3019 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020}
3021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3023{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003024 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 if (!PyUnicode_Check(unicode)) {
3026 PyErr_BadArgument();
3027 return NULL;
3028 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003029 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3030 PyUnicode_GET_SIZE(unicode));
3031
3032 if (!s)
3033 return NULL;
3034 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3035 PyBytes_GET_SIZE(s));
3036 Py_DECREF(s);
3037 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038}
3039
3040/* --- Raw Unicode Escape Codec ------------------------------------------- */
3041
3042PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003043 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 const char *errors)
3045{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003047 Py_ssize_t startinpos;
3048 Py_ssize_t endinpos;
3049 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 const char *end;
3053 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 PyObject *errorHandler = NULL;
3055 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 /* Escaped strings will always be longer than the resulting
3058 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 length after conversion to the true value. (But decoding error
3060 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 v = _PyUnicode_New(size);
3062 if (v == NULL)
3063 goto onError;
3064 if (size == 0)
3065 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 end = s + size;
3068 while (s < end) {
3069 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003070 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003072 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073
3074 /* Non-escape characters are interpreted as Unicode ordinals */
3075 if (*s != '\\') {
3076 *p++ = (unsigned char)*s++;
3077 continue;
3078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080
3081 /* \u-escapes are only interpreted iff the number of leading
3082 backslashes if odd */
3083 bs = s;
3084 for (;s < end;) {
3085 if (*s != '\\')
3086 break;
3087 *p++ = (unsigned char)*s++;
3088 }
3089 if (((s - bs) & 1) == 0 ||
3090 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003091 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 continue;
3093 }
3094 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003095 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 s++;
3097
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003098 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003100 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003102 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 endinpos = s-starts;
3104 if (unicode_decode_call_errorhandler(
3105 errors, &errorHandler,
3106 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003107 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 }
3112 x = (x<<4) & ~0xF;
3113 if (c >= '0' && c <= '9')
3114 x += c - '0';
3115 else if (c >= 'a' && c <= 'f')
3116 x += 10 + c - 'a';
3117 else
3118 x += 10 + c - 'A';
3119 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003120#ifndef Py_UNICODE_WIDE
3121 if (x > 0x10000) {
3122 if (unicode_decode_call_errorhandler(
3123 errors, &errorHandler,
3124 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003125 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003126 (PyObject **)&v, &outpos, &p))
3127 goto onError;
3128 }
3129#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 *p++ = x;
3131 nextByte:
3132 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003134 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003135 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 Py_XDECREF(errorHandler);
3137 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003139
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 onError:
3141 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 Py_XDECREF(errorHandler);
3143 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 return NULL;
3145}
3146
3147PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003148 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003150 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 char *p;
3152 char *q;
3153
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003154#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003155 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003156#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003157 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003158#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 if (repr == NULL)
3160 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003161 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003162 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163
Walter Dörwald711005d2007-05-12 12:03:26 +00003164 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 while (size-- > 0) {
3166 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003167#ifdef Py_UNICODE_WIDE
3168 /* Map 32-bit characters to '\Uxxxxxxxx' */
3169 if (ch >= 0x10000) {
3170 *p++ = '\\';
3171 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003172 *p++ = hexdigits[(ch >> 28) & 0xf];
3173 *p++ = hexdigits[(ch >> 24) & 0xf];
3174 *p++ = hexdigits[(ch >> 20) & 0xf];
3175 *p++ = hexdigits[(ch >> 16) & 0xf];
3176 *p++ = hexdigits[(ch >> 12) & 0xf];
3177 *p++ = hexdigits[(ch >> 8) & 0xf];
3178 *p++ = hexdigits[(ch >> 4) & 0xf];
3179 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003180 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181 else
3182#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 /* Map 16-bit characters to '\uxxxx' */
3184 if (ch >= 256) {
3185 *p++ = '\\';
3186 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003187 *p++ = hexdigits[(ch >> 12) & 0xf];
3188 *p++ = hexdigits[(ch >> 8) & 0xf];
3189 *p++ = hexdigits[(ch >> 4) & 0xf];
3190 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 }
3192 /* Copy everything else as-is */
3193 else
3194 *p++ = (char) ch;
3195 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003196 size = p - q;
3197
3198 done:
3199 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3200 Py_DECREF(repr);
3201 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202}
3203
3204PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3205{
Walter Dörwald711005d2007-05-12 12:03:26 +00003206 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003208 PyErr_BadArgument();
3209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003211 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3212 PyUnicode_GET_SIZE(unicode));
3213
3214 if (!s)
3215 return NULL;
3216 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3217 PyBytes_GET_SIZE(s));
3218 Py_DECREF(s);
3219 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220}
3221
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003222/* --- Unicode Internal Codec ------------------------------------------- */
3223
3224PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003225 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003226 const char *errors)
3227{
3228 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003229 Py_ssize_t startinpos;
3230 Py_ssize_t endinpos;
3231 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003232 PyUnicodeObject *v;
3233 Py_UNICODE *p;
3234 const char *end;
3235 const char *reason;
3236 PyObject *errorHandler = NULL;
3237 PyObject *exc = NULL;
3238
Neal Norwitzd43069c2006-01-08 01:12:10 +00003239#ifdef Py_UNICODE_WIDE
3240 Py_UNICODE unimax = PyUnicode_GetMax();
3241#endif
3242
Thomas Wouters89f507f2006-12-13 04:49:30 +00003243 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003244 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3245 if (v == NULL)
3246 goto onError;
3247 if (PyUnicode_GetSize((PyObject *)v) == 0)
3248 return (PyObject *)v;
3249 p = PyUnicode_AS_UNICODE(v);
3250 end = s + size;
3251
3252 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003253 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003254 /* We have to sanity check the raw data, otherwise doom looms for
3255 some malformed UCS-4 data. */
3256 if (
3257 #ifdef Py_UNICODE_WIDE
3258 *p > unimax || *p < 0 ||
3259 #endif
3260 end-s < Py_UNICODE_SIZE
3261 )
3262 {
3263 startinpos = s - starts;
3264 if (end-s < Py_UNICODE_SIZE) {
3265 endinpos = end-starts;
3266 reason = "truncated input";
3267 }
3268 else {
3269 endinpos = s - starts + Py_UNICODE_SIZE;
3270 reason = "illegal code point (> 0x10FFFF)";
3271 }
3272 outpos = p - PyUnicode_AS_UNICODE(v);
3273 if (unicode_decode_call_errorhandler(
3274 errors, &errorHandler,
3275 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003276 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003277 (PyObject **)&v, &outpos, &p)) {
3278 goto onError;
3279 }
3280 }
3281 else {
3282 p++;
3283 s += Py_UNICODE_SIZE;
3284 }
3285 }
3286
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003288 goto onError;
3289 Py_XDECREF(errorHandler);
3290 Py_XDECREF(exc);
3291 return (PyObject *)v;
3292
3293 onError:
3294 Py_XDECREF(v);
3295 Py_XDECREF(errorHandler);
3296 Py_XDECREF(exc);
3297 return NULL;
3298}
3299
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300/* --- Latin-1 Codec ------------------------------------------------------ */
3301
3302PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003303 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 const char *errors)
3305{
3306 PyUnicodeObject *v;
3307 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003308
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003310 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003311 Py_UNICODE r = *(unsigned char*)s;
3312 return PyUnicode_FromUnicode(&r, 1);
3313 }
3314
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 v = _PyUnicode_New(size);
3316 if (v == NULL)
3317 goto onError;
3318 if (size == 0)
3319 return (PyObject *)v;
3320 p = PyUnicode_AS_UNICODE(v);
3321 while (size-- > 0)
3322 *p++ = (unsigned char)*s++;
3323 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003324
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 onError:
3326 Py_XDECREF(v);
3327 return NULL;
3328}
3329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330/* create or adjust a UnicodeEncodeError */
3331static void make_encode_exception(PyObject **exceptionObject,
3332 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 const Py_UNICODE *unicode, Py_ssize_t size,
3334 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 if (*exceptionObject == NULL) {
3338 *exceptionObject = PyUnicodeEncodeError_Create(
3339 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 }
3341 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3343 goto onError;
3344 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3345 goto onError;
3346 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3347 goto onError;
3348 return;
3349 onError:
3350 Py_DECREF(*exceptionObject);
3351 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 }
3353}
3354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355/* raises a UnicodeEncodeError */
3356static void raise_encode_exception(PyObject **exceptionObject,
3357 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003358 const Py_UNICODE *unicode, Py_ssize_t size,
3359 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 const char *reason)
3361{
3362 make_encode_exception(exceptionObject,
3363 encoding, unicode, size, startpos, endpos, reason);
3364 if (*exceptionObject != NULL)
3365 PyCodec_StrictErrors(*exceptionObject);
3366}
3367
3368/* error handling callback helper:
3369 build arguments, call the callback and check the arguments,
3370 put the result into newpos and return the replacement string, which
3371 has to be freed by the caller */
3372static PyObject *unicode_encode_call_errorhandler(const char *errors,
3373 PyObject **errorHandler,
3374 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3376 Py_ssize_t startpos, Py_ssize_t endpos,
3377 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003379 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003380
3381 PyObject *restuple;
3382 PyObject *resunicode;
3383
3384 if (*errorHandler == NULL) {
3385 *errorHandler = PyCodec_LookupError(errors);
3386 if (*errorHandler == NULL)
3387 return NULL;
3388 }
3389
3390 make_encode_exception(exceptionObject,
3391 encoding, unicode, size, startpos, endpos, reason);
3392 if (*exceptionObject == NULL)
3393 return NULL;
3394
3395 restuple = PyObject_CallFunctionObjArgs(
3396 *errorHandler, *exceptionObject, NULL);
3397 if (restuple == NULL)
3398 return NULL;
3399 if (!PyTuple_Check(restuple)) {
3400 PyErr_Format(PyExc_TypeError, &argparse[4]);
3401 Py_DECREF(restuple);
3402 return NULL;
3403 }
3404 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3405 &resunicode, newpos)) {
3406 Py_DECREF(restuple);
3407 return NULL;
3408 }
3409 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003410 *newpos = size+*newpos;
3411 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003412 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003413 Py_DECREF(restuple);
3414 return NULL;
3415 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 Py_INCREF(resunicode);
3417 Py_DECREF(restuple);
3418 return resunicode;
3419}
3420
/* Shared encoder for the single-byte codecs whose byte values equal the
   Unicode ordinals.

   Encodes the `size` characters at `p`; every character below `limit` is
   emitted as one byte.  With limit == 256 errors are reported as "latin-1",
   with any other limit as "ascii".  Runs of characters >= limit are
   handled according to `errors`: NULL/"strict" raises, "replace" emits
   '?', "ignore" drops them, "xmlcharrefreplace" emits "&#N;" references
   and any other name is resolved via unicode_encode_call_errorhandler().

   Returns a new string object, or NULL with an exception set. */
static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     const char *errors,
                                     int limit)
{
    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output */
    char *str;
    /* size the output buffer has currently been allocated/resized to */
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* stays NULL on every error path, so the shared exit returns NULL */
    PyObject *result = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyString_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            /* input index of the first unencodable character of this run */
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* one '?' per unencodable character, so the output
                       never grows beyond the input length here */
                    while (collstart++<collend)
                        *str++ = '?'; /* fall through */
                case 3: /* ignore */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    respos = str - PyBytes_AS_STRING(res);
                    /* determine replacement size (temporarily (mis)uses p) */
                    /* each reference costs "&#" (2) + decimal digits + ";" (1) */
                    for (p = collstart, repsize = 0; p < collend; ++p) {
                        if (*p<10)
                            repsize += 2+1+1;
                        else if (*p<100)
                            repsize += 2+2+1;
                        else if (*p<1000)
                            repsize += 2+3+1;
                        else if (*p<10000)
                            repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                        else
                            repsize += 2+5+1;
#else
                        else if (*p<100000)
                            repsize += 2+5+1;
                        else if (*p<1000000)
                            repsize += 2+6+1;
                        else
                            repsize += 2+7+1;
#endif
                    }
                    /* bytes written so far + replacement + one byte for
                       each remaining input character */
                    requiredsize = respos+repsize+(endp-collend);
                    if (requiredsize > ressize) {
                        /* grow to at least double the current size */
                        if (requiredsize<2*ressize)
                            requiredsize = 2*ressize;
                        if (PyBytes_Resize(res, requiredsize))
                            goto onError;
                        /* re-derive the write pointer from the buffer
                           after the resize */
                        str = PyBytes_AS_STRING(res) + respos;
                        ressize = requiredsize;
                    }
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        str += sprintf(str, "&#%d;", (int)*p);
                    }
                    p = collend;
                    break;
                default:
                    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                        encoding, reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* need more space? (at least enough for what we
                       have+the replacement+the rest of the string, so
                       we won't have to check space for encodable characters) */
                    respos = str - PyBytes_AS_STRING(res);
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    requiredsize = respos+repsize+(endp-collend);
                    if (requiredsize > ressize) {
                        if (requiredsize<2*ressize)
                            requiredsize = 2*ressize;
                        if (PyBytes_Resize(res, requiredsize)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        str = PyBytes_AS_STRING(res) + respos;
                        ressize = requiredsize;
                    }
                    /* check if there is anything unencodable in the replacement
                       and copy it to the output */
                    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                        c = *uni2;
                        if (c >= limit) {
                            /* the replacement itself is unencodable: raise,
                               reporting the first character of the run */
                            raise_encode_exception(&exc, encoding, startp, size,
                                unicodepos, unicodepos+1, reason);
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        *str = (char)c;
                    }
                    /* continue at the input position chosen by the handler */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
            }
        }
    }
    /* copy the bytes actually written into the result string */
    result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
                                        str - PyBytes_AS_STRING(res));
    /* shared exit: the success path falls through to here as well */
  onError:
    Py_DECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return result;
}
3588
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 const char *errors)
3592{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594}
3595
3596PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3597{
3598 if (!PyUnicode_Check(unicode)) {
3599 PyErr_BadArgument();
3600 return NULL;
3601 }
3602 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3603 PyUnicode_GET_SIZE(unicode),
3604 NULL);
3605}
3606
3607/* --- 7-bit ASCII Codec -------------------------------------------------- */
3608
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003610 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 const char *errors)
3612{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 PyUnicodeObject *v;
3615 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003616 Py_ssize_t startinpos;
3617 Py_ssize_t endinpos;
3618 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 const char *e;
3620 PyObject *errorHandler = NULL;
3621 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003622
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003624 if (size == 1 && *(unsigned char*)s < 128) {
3625 Py_UNICODE r = *(unsigned char*)s;
3626 return PyUnicode_FromUnicode(&r, 1);
3627 }
Tim Petersced69f82003-09-16 20:30:58 +00003628
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 v = _PyUnicode_New(size);
3630 if (v == NULL)
3631 goto onError;
3632 if (size == 0)
3633 return (PyObject *)v;
3634 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 e = s + size;
3636 while (s < e) {
3637 register unsigned char c = (unsigned char)*s;
3638 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 ++s;
3641 }
3642 else {
3643 startinpos = s-starts;
3644 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003645 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 if (unicode_decode_call_errorhandler(
3647 errors, &errorHandler,
3648 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003649 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003654 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003655 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003656 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 Py_XDECREF(errorHandler);
3658 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003660
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 onError:
3662 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 Py_XDECREF(errorHandler);
3664 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 return NULL;
3666}
3667
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003669 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 const char *errors)
3671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673}
3674
3675PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3676{
3677 if (!PyUnicode_Check(unicode)) {
3678 PyErr_BadArgument();
3679 return NULL;
3680 }
3681 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3682 PyUnicode_GET_SIZE(unicode),
3683 NULL);
3684}
3685
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003686#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003687
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003688/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003689
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003690#if SIZEOF_INT < SIZEOF_SSIZE_T
3691#define NEED_RETRY
3692#endif
3693
3694/* XXX This code is limited to "true" double-byte encodings, as
3695 a) it assumes an incomplete character consists of a single byte, and
3696 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3697 encodings, see IsDBCSLeadByteEx documentation. */
3698
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.

   IsDBCSLeadByte() on the byte alone is not sufficient, so the position of
   the preceding character (found with CharPrev()) is examined as well. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() did not move back */
    if (prev == curr)
        return 1;
    /* the previous character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the previous character occupies exactly two bytes */
    return curr - prev == 2;
}
3709
3710/*
3711 * Decode MBCS string into unicode object. If 'final' is set, converts
3712 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3713 */
3714static int decode_mbcs(PyUnicodeObject **v,
3715 const char *s, /* MBCS string */
3716 int size, /* sizeof MBCS string */
3717 int final)
3718{
3719 Py_UNICODE *p;
3720 Py_ssize_t n = 0;
3721 int usize = 0;
3722
3723 assert(size >= 0);
3724
3725 /* Skip trailing lead-byte unless 'final' is set */
3726 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3727 --size;
3728
3729 /* First get the size of the result */
3730 if (size > 0) {
3731 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3732 if (usize == 0) {
3733 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3734 return -1;
3735 }
3736 }
3737
3738 if (*v == NULL) {
3739 /* Create unicode object */
3740 *v = _PyUnicode_New(usize);
3741 if (*v == NULL)
3742 return -1;
3743 }
3744 else {
3745 /* Extend unicode object */
3746 n = PyUnicode_GET_SIZE(*v);
3747 if (_PyUnicode_Resize(v, n + usize) < 0)
3748 return -1;
3749 }
3750
3751 /* Do the conversion */
3752 if (size > 0) {
3753 p = PyUnicode_AS_UNICODE(*v) + n;
3754 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3755 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3756 return -1;
3757 }
3758 }
3759
3760 return size;
3761}
3762
3763PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3764 Py_ssize_t size,
3765 const char *errors,
3766 Py_ssize_t *consumed)
3767{
3768 PyUnicodeObject *v = NULL;
3769 int done;
3770
3771 if (consumed)
3772 *consumed = 0;
3773
3774#ifdef NEED_RETRY
3775 retry:
3776 if (size > INT_MAX)
3777 done = decode_mbcs(&v, s, INT_MAX, 0);
3778 else
3779#endif
3780 done = decode_mbcs(&v, s, (int)size, !consumed);
3781
3782 if (done < 0) {
3783 Py_XDECREF(v);
3784 return NULL;
3785 }
3786
3787 if (consumed)
3788 *consumed += done;
3789
3790#ifdef NEED_RETRY
3791 if (size > INT_MAX) {
3792 s += done;
3793 size -= done;
3794 goto retry;
3795 }
3796#endif
3797
3798 return (PyObject *)v;
3799}
3800
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003801PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003802 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003803 const char *errors)
3804{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003805 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3806}
3807
3808/*
3809 * Convert unicode into string object (MBCS).
3810 * Returns 0 if succeed, -1 otherwise.
3811 */
3812static int encode_mbcs(PyObject **repr,
3813 const Py_UNICODE *p, /* unicode */
3814 int size) /* size of unicode */
3815{
3816 int mbcssize = 0;
3817 Py_ssize_t n = 0;
3818
3819 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003820
3821 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003822 if (size > 0) {
3823 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3824 if (mbcssize == 0) {
3825 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3826 return -1;
3827 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003828 }
3829
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003830 if (*repr == NULL) {
3831 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003832 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003833 if (*repr == NULL)
3834 return -1;
3835 }
3836 else {
3837 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003838 n = PyString_Size(*repr);
3839 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003840 return -1;
3841 }
3842
3843 /* Do the conversion */
3844 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003845 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003846 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3847 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3848 return -1;
3849 }
3850 }
3851
3852 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003853}
3854
3855PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003856 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003857 const char *errors)
3858{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003859 PyObject *repr = NULL;
3860 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003861
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003862#ifdef NEED_RETRY
3863 retry:
3864 if (size > INT_MAX)
3865 ret = encode_mbcs(&repr, p, INT_MAX);
3866 else
3867#endif
3868 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003869
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003870 if (ret < 0) {
3871 Py_XDECREF(repr);
3872 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003873 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003874
3875#ifdef NEED_RETRY
3876 if (size > INT_MAX) {
3877 p += INT_MAX;
3878 size -= INT_MAX;
3879 goto retry;
3880 }
3881#endif
3882
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003883 return repr;
3884}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003885
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003886PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3887{
3888 if (!PyUnicode_Check(unicode)) {
3889 PyErr_BadArgument();
3890 return NULL;
3891 }
3892 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3893 PyUnicode_GET_SIZE(unicode),
3894 NULL);
3895}
3896
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003897#undef NEED_RETRY
3898
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003899#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003900
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901/* --- Character Mapping Codec -------------------------------------------- */
3902
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003904 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 PyObject *mapping,
3906 const char *errors)
3907{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003909 Py_ssize_t startinpos;
3910 Py_ssize_t endinpos;
3911 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 PyUnicodeObject *v;
3914 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003915 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 PyObject *errorHandler = NULL;
3917 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003918 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003919 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003920
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 /* Default to Latin-1 */
3922 if (mapping == NULL)
3923 return PyUnicode_DecodeLatin1(s, size, errors);
3924
3925 v = _PyUnicode_New(size);
3926 if (v == NULL)
3927 goto onError;
3928 if (size == 0)
3929 return (PyObject *)v;
3930 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003932 if (PyUnicode_CheckExact(mapping)) {
3933 mapstring = PyUnicode_AS_UNICODE(mapping);
3934 maplen = PyUnicode_GET_SIZE(mapping);
3935 while (s < e) {
3936 unsigned char ch = *s;
3937 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003939 if (ch < maplen)
3940 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003942 if (x == 0xfffe) {
3943 /* undefined mapping */
3944 outpos = p-PyUnicode_AS_UNICODE(v);
3945 startinpos = s-starts;
3946 endinpos = startinpos+1;
3947 if (unicode_decode_call_errorhandler(
3948 errors, &errorHandler,
3949 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003950 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003951 (PyObject **)&v, &outpos, &p)) {
3952 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003953 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003954 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003955 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003956 *p++ = x;
3957 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003959 }
3960 else {
3961 while (s < e) {
3962 unsigned char ch = *s;
3963 PyObject *w, *x;
3964
3965 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3966 w = PyInt_FromLong((long)ch);
3967 if (w == NULL)
3968 goto onError;
3969 x = PyObject_GetItem(mapping, w);
3970 Py_DECREF(w);
3971 if (x == NULL) {
3972 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3973 /* No mapping found means: mapping is undefined. */
3974 PyErr_Clear();
3975 x = Py_None;
3976 Py_INCREF(x);
3977 } else
3978 goto onError;
3979 }
3980
3981 /* Apply mapping */
3982 if (PyInt_Check(x)) {
3983 long value = PyInt_AS_LONG(x);
3984 if (value < 0 || value > 65535) {
3985 PyErr_SetString(PyExc_TypeError,
3986 "character mapping must be in range(65536)");
3987 Py_DECREF(x);
3988 goto onError;
3989 }
3990 *p++ = (Py_UNICODE)value;
3991 }
3992 else if (x == Py_None) {
3993 /* undefined mapping */
3994 outpos = p-PyUnicode_AS_UNICODE(v);
3995 startinpos = s-starts;
3996 endinpos = startinpos+1;
3997 if (unicode_decode_call_errorhandler(
3998 errors, &errorHandler,
3999 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004000 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004001 (PyObject **)&v, &outpos, &p)) {
4002 Py_DECREF(x);
4003 goto onError;
4004 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004005 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004006 continue;
4007 }
4008 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004009 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004010
4011 if (targetsize == 1)
4012 /* 1-1 mapping */
4013 *p++ = *PyUnicode_AS_UNICODE(x);
4014
4015 else if (targetsize > 1) {
4016 /* 1-n mapping */
4017 if (targetsize > extrachars) {
4018 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004019 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4020 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004021 (targetsize << 2);
4022 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004023 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004024 if (_PyUnicode_Resize(&v,
4025 PyUnicode_GET_SIZE(v) + needed) < 0) {
4026 Py_DECREF(x);
4027 goto onError;
4028 }
4029 p = PyUnicode_AS_UNICODE(v) + oldpos;
4030 }
4031 Py_UNICODE_COPY(p,
4032 PyUnicode_AS_UNICODE(x),
4033 targetsize);
4034 p += targetsize;
4035 extrachars -= targetsize;
4036 }
4037 /* 1-0 mapping: skip the character */
4038 }
4039 else {
4040 /* wrong return value */
4041 PyErr_SetString(PyExc_TypeError,
4042 "character mapping must return integer, None or unicode");
4043 Py_DECREF(x);
4044 goto onError;
4045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004047 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049 }
4050 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004051 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 Py_XDECREF(errorHandler);
4054 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 Py_XDECREF(errorHandler);
4059 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 Py_XDECREF(v);
4061 return NULL;
4062}
4063
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004064/* Charmap encoding: the lookup table */
4065
4066struct encoding_map{
4067 PyObject_HEAD
4068 unsigned char level1[32];
4069 int count2, count3;
4070 unsigned char level23[1];
4071};
4072
4073static PyObject*
4074encoding_map_size(PyObject *obj, PyObject* args)
4075{
4076 struct encoding_map *map = (struct encoding_map*)obj;
4077 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4078 128*map->count3);
4079}
4080
4081static PyMethodDef encoding_map_methods[] = {
4082 {"size", encoding_map_size, METH_NOARGS,
4083 PyDoc_STR("Return the size (in bytes) of this object") },
4084 { 0 }
4085};
4086
/* tp_dealloc slot of the EncodingMap type.  The struct holds only byte
   arrays and counters, so freeing the object's memory block is the whole
   job. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4092
/* Type object for struct encoding_map instances.  Only a deallocator and
   the method table (size()) are provided; every other slot is left empty.
   tp_itemsize is 0, so the variable-length tail of an instance is sized by
   the code that allocates it, not through the type. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4136
4137PyObject*
4138PyUnicode_BuildEncodingMap(PyObject* string)
4139{
4140 Py_UNICODE *decode;
4141 PyObject *result;
4142 struct encoding_map *mresult;
4143 int i;
4144 int need_dict = 0;
4145 unsigned char level1[32];
4146 unsigned char level2[512];
4147 unsigned char *mlevel1, *mlevel2, *mlevel3;
4148 int count2 = 0, count3 = 0;
4149
4150 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4151 PyErr_BadArgument();
4152 return NULL;
4153 }
4154 decode = PyUnicode_AS_UNICODE(string);
4155 memset(level1, 0xFF, sizeof level1);
4156 memset(level2, 0xFF, sizeof level2);
4157
4158 /* If there isn't a one-to-one mapping of NULL to \0,
4159 or if there are non-BMP characters, we need to use
4160 a mapping dictionary. */
4161 if (decode[0] != 0)
4162 need_dict = 1;
4163 for (i = 1; i < 256; i++) {
4164 int l1, l2;
4165 if (decode[i] == 0
4166 #ifdef Py_UNICODE_WIDE
4167 || decode[i] > 0xFFFF
4168 #endif
4169 ) {
4170 need_dict = 1;
4171 break;
4172 }
4173 if (decode[i] == 0xFFFE)
4174 /* unmapped character */
4175 continue;
4176 l1 = decode[i] >> 11;
4177 l2 = decode[i] >> 7;
4178 if (level1[l1] == 0xFF)
4179 level1[l1] = count2++;
4180 if (level2[l2] == 0xFF)
4181 level2[l2] = count3++;
4182 }
4183
4184 if (count2 >= 0xFF || count3 >= 0xFF)
4185 need_dict = 1;
4186
4187 if (need_dict) {
4188 PyObject *result = PyDict_New();
4189 PyObject *key, *value;
4190 if (!result)
4191 return NULL;
4192 for (i = 0; i < 256; i++) {
4193 key = value = NULL;
4194 key = PyInt_FromLong(decode[i]);
4195 value = PyInt_FromLong(i);
4196 if (!key || !value)
4197 goto failed1;
4198 if (PyDict_SetItem(result, key, value) == -1)
4199 goto failed1;
4200 Py_DECREF(key);
4201 Py_DECREF(value);
4202 }
4203 return result;
4204 failed1:
4205 Py_XDECREF(key);
4206 Py_XDECREF(value);
4207 Py_DECREF(result);
4208 return NULL;
4209 }
4210
4211 /* Create a three-level trie */
4212 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4213 16*count2 + 128*count3 - 1);
4214 if (!result)
4215 return PyErr_NoMemory();
4216 PyObject_Init(result, &EncodingMapType);
4217 mresult = (struct encoding_map*)result;
4218 mresult->count2 = count2;
4219 mresult->count3 = count3;
4220 mlevel1 = mresult->level1;
4221 mlevel2 = mresult->level23;
4222 mlevel3 = mresult->level23 + 16*count2;
4223 memcpy(mlevel1, level1, 32);
4224 memset(mlevel2, 0xFF, 16*count2);
4225 memset(mlevel3, 0, 128*count3);
4226 count3 = 0;
4227 for (i = 1; i < 256; i++) {
4228 int o1, o2, o3, i2, i3;
4229 if (decode[i] == 0xFFFE)
4230 /* unmapped character */
4231 continue;
4232 o1 = decode[i]>>11;
4233 o2 = (decode[i]>>7) & 0xF;
4234 i2 = 16*mlevel1[o1] + o2;
4235 if (mlevel2[i2] == 0xFF)
4236 mlevel2[i2] = count3++;
4237 o3 = decode[i] & 0x7F;
4238 i3 = 128*mlevel2[i2] + o3;
4239 mlevel3[i3] = i;
4240 }
4241 return result;
4242}
4243
4244static int
4245encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4246{
4247 struct encoding_map *map = (struct encoding_map*)mapping;
4248 int l1 = c>>11;
4249 int l2 = (c>>7) & 0xF;
4250 int l3 = c & 0x7F;
4251 int i;
4252
4253#ifdef Py_UNICODE_WIDE
4254 if (c > 0xFFFF) {
4255 return -1;
4256 }
4257#endif
4258 if (c == 0)
4259 return 0;
4260 /* level 1*/
4261 i = map->level1[l1];
4262 if (i == 0xFF) {
4263 return -1;
4264 }
4265 /* level 2*/
4266 i = map->level23[16*i+l2];
4267 if (i == 0xFF) {
4268 return -1;
4269 }
4270 /* level 3 */
4271 i = map->level23[16*map->count2 + 128*i + l3];
4272 if (i == 0) {
4273 return -1;
4274 }
4275 return i;
4276}
4277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278/* Lookup the character ch in the mapping. If the character
4279 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004280 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 PyObject *w = PyInt_FromLong((long)c);
4284 PyObject *x;
4285
4286 if (w == NULL)
4287 return NULL;
4288 x = PyObject_GetItem(mapping, w);
4289 Py_DECREF(w);
4290 if (x == NULL) {
4291 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4292 /* No mapping found means: mapping is undefined. */
4293 PyErr_Clear();
4294 x = Py_None;
4295 Py_INCREF(x);
4296 return x;
4297 } else
4298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004300 else if (x == Py_None)
4301 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 else if (PyInt_Check(x)) {
4303 long value = PyInt_AS_LONG(x);
4304 if (value < 0 || value > 255) {
4305 PyErr_SetString(PyExc_TypeError,
4306 "character mapping must be in range(256)");
4307 Py_DECREF(x);
4308 return NULL;
4309 }
4310 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 else if (PyString_Check(x))
4313 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004316 PyErr_Format(PyExc_TypeError,
4317 "character mapping must return integer, None or str8, not %.400s",
4318 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_DECREF(x);
4320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 }
4322}
4323
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004324static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004325charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004326{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004327 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004328 /* exponentially overallocate to minimize reallocations */
4329 if (requiredsize < 2*outsize)
4330 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004331 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004332 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004333 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004334}
4335
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004340 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 space is available. Return a new reference to the object that
4342 was put in the output buffer, or Py_None, if the mapping was undefined
4343 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004344 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004346charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004347 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004349 PyObject *rep;
4350 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004351 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004353 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004354 int res = encoding_map_lookup(c, mapping);
4355 Py_ssize_t requiredsize = *outpos+1;
4356 if (res == -1)
4357 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004358 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004359 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004360 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004361 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004362 outstart[(*outpos)++] = (char)res;
4363 return enc_SUCCESS;
4364 }
4365
4366 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004368 return enc_EXCEPTION;
4369 else if (rep==Py_None) {
4370 Py_DECREF(rep);
4371 return enc_FAILED;
4372 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004374 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004375 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004376 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004378 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004380 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4382 }
4383 else {
4384 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4386 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004387 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004388 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004390 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004392 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 memcpy(outstart + *outpos, repchars, repsize);
4394 *outpos += repsize;
4395 }
4396 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004397 Py_DECREF(rep);
4398 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399}
4400
4401/* handle an error in PyUnicode_EncodeCharmap
4402 Return 0 on success, -1 on error */
4403static
4404int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004405 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004407 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004408 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409{
4410 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004411 Py_ssize_t repsize;
4412 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 Py_UNICODE *uni2;
4414 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t collstartpos = *inpos;
4416 Py_ssize_t collendpos = *inpos+1;
4417 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 char *encoding = "charmap";
4419 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004420 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 /* find all unencodable characters */
4423 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004424 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004425 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004426 int res = encoding_map_lookup(p[collendpos], mapping);
4427 if (res != -1)
4428 break;
4429 ++collendpos;
4430 continue;
4431 }
4432
4433 rep = charmapencode_lookup(p[collendpos], mapping);
4434 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004436 else if (rep!=Py_None) {
4437 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 break;
4439 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004440 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 ++collendpos;
4442 }
4443 /* cache callback name lookup
4444 * (if not done yet, i.e. it's the first error) */
4445 if (*known_errorHandler==-1) {
4446 if ((errors==NULL) || (!strcmp(errors, "strict")))
4447 *known_errorHandler = 1;
4448 else if (!strcmp(errors, "replace"))
4449 *known_errorHandler = 2;
4450 else if (!strcmp(errors, "ignore"))
4451 *known_errorHandler = 3;
4452 else if (!strcmp(errors, "xmlcharrefreplace"))
4453 *known_errorHandler = 4;
4454 else
4455 *known_errorHandler = 0;
4456 }
4457 switch (*known_errorHandler) {
4458 case 1: /* strict */
4459 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4460 return -1;
4461 case 2: /* replace */
4462 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4463 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004464 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 return -1;
4466 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004467 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4469 return -1;
4470 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 }
4472 /* fall through */
4473 case 3: /* ignore */
4474 *inpos = collendpos;
4475 break;
4476 case 4: /* xmlcharrefreplace */
4477 /* generate replacement (temporarily (mis)uses p) */
4478 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4479 char buffer[2+29+1+1];
4480 char *cp;
4481 sprintf(buffer, "&#%d;", (int)p[collpos]);
4482 for (cp = buffer; *cp; ++cp) {
4483 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004484 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004486 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4488 return -1;
4489 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 }
4491 }
4492 *inpos = collendpos;
4493 break;
4494 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004495 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 encoding, reason, p, size, exceptionObject,
4497 collstartpos, collendpos, &newpos);
4498 if (repunicode == NULL)
4499 return -1;
4500 /* generate replacement */
4501 repsize = PyUnicode_GET_SIZE(repunicode);
4502 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4503 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004504 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 return -1;
4506 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004507 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4510 return -1;
4511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 }
4513 *inpos = newpos;
4514 Py_DECREF(repunicode);
4515 }
4516 return 0;
4517}
4518
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004520 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 PyObject *mapping,
4522 const char *errors)
4523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 /* output object */
4525 PyObject *res = NULL;
4526 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004527 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004529 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 PyObject *errorHandler = NULL;
4531 PyObject *exc = NULL;
4532 /* the following variable is used for caching string comparisons
4533 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4534 * 3=ignore, 4=xmlcharrefreplace */
4535 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536
4537 /* Default to Latin-1 */
4538 if (mapping == NULL)
4539 return PyUnicode_EncodeLatin1(p, size, errors);
4540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 /* allocate enough for a simple encoding without
4542 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004543 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 if (res == NULL)
4545 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004546 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 while (inpos<size) {
4550 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004551 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004552 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004554 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 if (charmap_encoding_error(p, size, &inpos, mapping,
4556 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004557 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004558 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004559 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 else
4563 /* done with this character => adjust input position */
4564 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004568 if (respos<PyString_GET_SIZE(res))
4569 _PyString_Resize(&res, respos);
4570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 Py_XDECREF(exc);
4572 Py_XDECREF(errorHandler);
4573 return res;
4574
4575 onError:
4576 Py_XDECREF(res);
4577 Py_XDECREF(exc);
4578 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 return NULL;
4580}
4581
4582PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4583 PyObject *mapping)
4584{
4585 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4586 PyErr_BadArgument();
4587 return NULL;
4588 }
4589 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4590 PyUnicode_GET_SIZE(unicode),
4591 mapping,
4592 NULL);
4593}
4594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595/* create or adjust a UnicodeTranslateError */
4596static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004597 const Py_UNICODE *unicode, Py_ssize_t size,
4598 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 if (*exceptionObject == NULL) {
4602 *exceptionObject = PyUnicodeTranslateError_Create(
4603 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 }
4605 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4607 goto onError;
4608 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4609 goto onError;
4610 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4611 goto onError;
4612 return;
4613 onError:
4614 Py_DECREF(*exceptionObject);
4615 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 }
4617}
4618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619/* raises a UnicodeTranslateError */
4620static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004621 const Py_UNICODE *unicode, Py_ssize_t size,
4622 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 const char *reason)
4624{
4625 make_translate_exception(exceptionObject,
4626 unicode, size, startpos, endpos, reason);
4627 if (*exceptionObject != NULL)
4628 PyCodec_StrictErrors(*exceptionObject);
4629}
4630
4631/* error handling callback helper:
4632 build arguments, call the callback and check the arguments,
4633 put the result into newpos and return the replacement string, which
4634 has to be freed by the caller */
4635static PyObject *unicode_translate_call_errorhandler(const char *errors,
4636 PyObject **errorHandler,
4637 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4639 Py_ssize_t startpos, Py_ssize_t endpos,
4640 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004642 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004644 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 PyObject *restuple;
4646 PyObject *resunicode;
4647
4648 if (*errorHandler == NULL) {
4649 *errorHandler = PyCodec_LookupError(errors);
4650 if (*errorHandler == NULL)
4651 return NULL;
4652 }
4653
4654 make_translate_exception(exceptionObject,
4655 unicode, size, startpos, endpos, reason);
4656 if (*exceptionObject == NULL)
4657 return NULL;
4658
4659 restuple = PyObject_CallFunctionObjArgs(
4660 *errorHandler, *exceptionObject, NULL);
4661 if (restuple == NULL)
4662 return NULL;
4663 if (!PyTuple_Check(restuple)) {
4664 PyErr_Format(PyExc_TypeError, &argparse[4]);
4665 Py_DECREF(restuple);
4666 return NULL;
4667 }
4668 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004669 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 Py_DECREF(restuple);
4671 return NULL;
4672 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004673 if (i_newpos<0)
4674 *newpos = size+i_newpos;
4675 else
4676 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004677 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004678 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004679 Py_DECREF(restuple);
4680 return NULL;
4681 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 Py_INCREF(resunicode);
4683 Py_DECREF(restuple);
4684 return resunicode;
4685}
4686
4687/* Lookup the character ch in the mapping and put the result in result,
4688 which must be decrefed by the caller.
4689 Return 0 on success, -1 on error */
4690static
4691int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4692{
4693 PyObject *w = PyInt_FromLong((long)c);
4694 PyObject *x;
4695
4696 if (w == NULL)
4697 return -1;
4698 x = PyObject_GetItem(mapping, w);
4699 Py_DECREF(w);
4700 if (x == NULL) {
4701 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4702 /* No mapping found means: use 1:1 mapping. */
4703 PyErr_Clear();
4704 *result = NULL;
4705 return 0;
4706 } else
4707 return -1;
4708 }
4709 else if (x == Py_None) {
4710 *result = x;
4711 return 0;
4712 }
4713 else if (PyInt_Check(x)) {
4714 long value = PyInt_AS_LONG(x);
4715 long max = PyUnicode_GetMax();
4716 if (value < 0 || value > max) {
4717 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004718 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 Py_DECREF(x);
4720 return -1;
4721 }
4722 *result = x;
4723 return 0;
4724 }
4725 else if (PyUnicode_Check(x)) {
4726 *result = x;
4727 return 0;
4728 }
4729 else {
4730 /* wrong return value */
4731 PyErr_SetString(PyExc_TypeError,
4732 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004733 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 return -1;
4735 }
4736}
4737/* ensure that *outobj is at least requiredsize characters long,
4738if not reallocate and adjust various state variables.
4739Return 0 on success, -1 on error */
4740static
Walter Dörwald4894c302003-10-24 14:25:28 +00004741int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004742 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004744 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004745 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004749 if (requiredsize < 2 * oldsize)
4750 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004751 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 return -1;
4753 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 }
4755 return 0;
4756}
4757/* lookup the character, put the result in the output string and adjust
4758 various state variables. Return a new reference to the object that
4759 was put in the output buffer in *result, or Py_None, if the mapping was
4760 undefined (in which case no character was written).
4761 The called must decref result.
4762 Return 0 on success, -1 on error. */
4763static
Walter Dörwald4894c302003-10-24 14:25:28 +00004764int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004765 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004766 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767{
Walter Dörwald4894c302003-10-24 14:25:28 +00004768 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 return -1;
4770 if (*res==NULL) {
4771 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004772 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 }
4774 else if (*res==Py_None)
4775 ;
4776 else if (PyInt_Check(*res)) {
4777 /* no overflow check, because we know that the space is enough */
4778 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4779 }
4780 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 if (repsize==1) {
4783 /* no overflow check, because we know that the space is enough */
4784 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4785 }
4786 else if (repsize!=0) {
4787 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004789 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004790 repsize - 1;
4791 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 return -1;
4793 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4794 *outp += repsize;
4795 }
4796 }
4797 else
4798 return -1;
4799 return 0;
4800}
4801
4802PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 PyObject *mapping,
4805 const char *errors)
4806{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 /* output object */
4808 PyObject *res = NULL;
4809 /* pointers to the beginning and end+1 of input */
4810 const Py_UNICODE *startp = p;
4811 const Py_UNICODE *endp = p + size;
4812 /* pointer into the output */
4813 Py_UNICODE *str;
4814 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 char *reason = "character maps to <undefined>";
4817 PyObject *errorHandler = NULL;
4818 PyObject *exc = NULL;
4819 /* the following variable is used for caching string comparisons
4820 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4821 * 3=ignore, 4=xmlcharrefreplace */
4822 int known_errorHandler = -1;
4823
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 if (mapping == NULL) {
4825 PyErr_BadArgument();
4826 return NULL;
4827 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828
4829 /* allocate enough for a simple 1:1 translation without
4830 replacements, if we need more, we'll resize */
4831 res = PyUnicode_FromUnicode(NULL, size);
4832 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004833 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 return res;
4836 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004838 while (p<endp) {
4839 /* try to encode it */
4840 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004841 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 goto onError;
4844 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004845 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 if (x!=Py_None) /* it worked => adjust input pointer */
4847 ++p;
4848 else { /* untranslatable character */
4849 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004850 Py_ssize_t repsize;
4851 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852 Py_UNICODE *uni2;
4853 /* startpos for collecting untranslatable chars */
4854 const Py_UNICODE *collstart = p;
4855 const Py_UNICODE *collend = p+1;
4856 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 /* find all untranslatable characters */
4859 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004860 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 goto onError;
4862 Py_XDECREF(x);
4863 if (x!=Py_None)
4864 break;
4865 ++collend;
4866 }
4867 /* cache callback name lookup
4868 * (if not done yet, i.e. it's the first error) */
4869 if (known_errorHandler==-1) {
4870 if ((errors==NULL) || (!strcmp(errors, "strict")))
4871 known_errorHandler = 1;
4872 else if (!strcmp(errors, "replace"))
4873 known_errorHandler = 2;
4874 else if (!strcmp(errors, "ignore"))
4875 known_errorHandler = 3;
4876 else if (!strcmp(errors, "xmlcharrefreplace"))
4877 known_errorHandler = 4;
4878 else
4879 known_errorHandler = 0;
4880 }
4881 switch (known_errorHandler) {
4882 case 1: /* strict */
4883 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4884 goto onError;
4885 case 2: /* replace */
4886 /* No need to check for space, this is a 1:1 replacement */
4887 for (coll = collstart; coll<collend; ++coll)
4888 *str++ = '?';
4889 /* fall through */
4890 case 3: /* ignore */
4891 p = collend;
4892 break;
4893 case 4: /* xmlcharrefreplace */
4894 /* generate replacement (temporarily (mis)uses p) */
4895 for (p = collstart; p < collend; ++p) {
4896 char buffer[2+29+1+1];
4897 char *cp;
4898 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004899 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4901 goto onError;
4902 for (cp = buffer; *cp; ++cp)
4903 *str++ = *cp;
4904 }
4905 p = collend;
4906 break;
4907 default:
4908 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4909 reason, startp, size, &exc,
4910 collstart-startp, collend-startp, &newpos);
4911 if (repunicode == NULL)
4912 goto onError;
4913 /* generate replacement */
4914 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004915 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4917 Py_DECREF(repunicode);
4918 goto onError;
4919 }
4920 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4921 *str++ = *uni2;
4922 p = startp + newpos;
4923 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 }
4925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 /* Resize if we allocated to much */
4928 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004929 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004930 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004931 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 }
4933 Py_XDECREF(exc);
4934 Py_XDECREF(errorHandler);
4935 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 onError:
4938 Py_XDECREF(res);
4939 Py_XDECREF(exc);
4940 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 return NULL;
4942}
4943
4944PyObject *PyUnicode_Translate(PyObject *str,
4945 PyObject *mapping,
4946 const char *errors)
4947{
4948 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004949
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 str = PyUnicode_FromObject(str);
4951 if (str == NULL)
4952 goto onError;
4953 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4954 PyUnicode_GET_SIZE(str),
4955 mapping,
4956 errors);
4957 Py_DECREF(str);
4958 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004959
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 onError:
4961 Py_XDECREF(str);
4962 return NULL;
4963}
Tim Petersced69f82003-09-16 20:30:58 +00004964
Guido van Rossum9e896b32000-04-05 20:11:21 +00004965/* --- Decimal Encoder ---------------------------------------------------- */
4966
4967int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004968 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004969 char *output,
4970 const char *errors)
4971{
4972 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 PyObject *errorHandler = NULL;
4974 PyObject *exc = NULL;
4975 const char *encoding = "decimal";
4976 const char *reason = "invalid decimal Unicode string";
4977 /* the following variable is used for caching string comparisons
4978 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4979 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004980
4981 if (output == NULL) {
4982 PyErr_BadArgument();
4983 return -1;
4984 }
4985
4986 p = s;
4987 end = s + length;
4988 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004990 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004992 Py_ssize_t repsize;
4993 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 Py_UNICODE *uni2;
4995 Py_UNICODE *collstart;
4996 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004997
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 if (Py_UNICODE_ISSPACE(ch)) {
4999 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005001 continue;
5002 }
5003 decimal = Py_UNICODE_TODECIMAL(ch);
5004 if (decimal >= 0) {
5005 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005007 continue;
5008 }
Guido van Rossumba477042000-04-06 18:18:10 +00005009 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005010 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005012 continue;
5013 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 /* All other characters are considered unencodable */
5015 collstart = p;
5016 collend = p+1;
5017 while (collend < end) {
5018 if ((0 < *collend && *collend < 256) ||
5019 !Py_UNICODE_ISSPACE(*collend) ||
5020 Py_UNICODE_TODECIMAL(*collend))
5021 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005022 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 /* cache callback name lookup
5024 * (if not done yet, i.e. it's the first error) */
5025 if (known_errorHandler==-1) {
5026 if ((errors==NULL) || (!strcmp(errors, "strict")))
5027 known_errorHandler = 1;
5028 else if (!strcmp(errors, "replace"))
5029 known_errorHandler = 2;
5030 else if (!strcmp(errors, "ignore"))
5031 known_errorHandler = 3;
5032 else if (!strcmp(errors, "xmlcharrefreplace"))
5033 known_errorHandler = 4;
5034 else
5035 known_errorHandler = 0;
5036 }
5037 switch (known_errorHandler) {
5038 case 1: /* strict */
5039 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5040 goto onError;
5041 case 2: /* replace */
5042 for (p = collstart; p < collend; ++p)
5043 *output++ = '?';
5044 /* fall through */
5045 case 3: /* ignore */
5046 p = collend;
5047 break;
5048 case 4: /* xmlcharrefreplace */
5049 /* generate replacement (temporarily (mis)uses p) */
5050 for (p = collstart; p < collend; ++p)
5051 output += sprintf(output, "&#%d;", (int)*p);
5052 p = collend;
5053 break;
5054 default:
5055 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5056 encoding, reason, s, length, &exc,
5057 collstart-s, collend-s, &newpos);
5058 if (repunicode == NULL)
5059 goto onError;
5060 /* generate replacement */
5061 repsize = PyUnicode_GET_SIZE(repunicode);
5062 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5063 Py_UNICODE ch = *uni2;
5064 if (Py_UNICODE_ISSPACE(ch))
5065 *output++ = ' ';
5066 else {
5067 decimal = Py_UNICODE_TODECIMAL(ch);
5068 if (decimal >= 0)
5069 *output++ = '0' + decimal;
5070 else if (0 < ch && ch < 256)
5071 *output++ = (char)ch;
5072 else {
5073 Py_DECREF(repunicode);
5074 raise_encode_exception(&exc, encoding,
5075 s, length, collstart-s, collend-s, reason);
5076 goto onError;
5077 }
5078 }
5079 }
5080 p = s + newpos;
5081 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005082 }
5083 }
5084 /* 0-terminate the output string */
5085 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 Py_XDECREF(exc);
5087 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005088 return 0;
5089
5090 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 Py_XDECREF(exc);
5092 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005093 return -1;
5094}
5095
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096/* --- Helpers ------------------------------------------------------------ */
5097
Eric Smith8c663262007-08-25 02:26:07 +00005098#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005099
5100#include "stringlib/fastsearch.h"
5101
5102#include "stringlib/count.h"
5103#include "stringlib/find.h"
5104#include "stringlib/partition.h"
5105
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope and
   reads (obj)->length:
     - a negative start is made relative to the length and then clamped
       to 0;
     - an end beyond the length is clamped to the length;
     - a negative end is made relative to the length and then clamped
       to 0.
   The macro expands to several statements, so it must not be used as
   the unbraced body of an if/else or loop. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5118
Martin v. Löwis18e16552006-02-15 17:27:45 +00005119Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005120 PyObject *substr,
5121 Py_ssize_t start,
5122 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005124 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005125 PyUnicodeObject* str_obj;
5126 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005127
Thomas Wouters477c8d52006-05-27 19:21:47 +00005128 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5129 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005131 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5132 if (!sub_obj) {
5133 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 return -1;
5135 }
Tim Petersced69f82003-09-16 20:30:58 +00005136
Thomas Wouters477c8d52006-05-27 19:21:47 +00005137 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005138
Thomas Wouters477c8d52006-05-27 19:21:47 +00005139 result = stringlib_count(
5140 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5141 );
5142
5143 Py_DECREF(sub_obj);
5144 Py_DECREF(str_obj);
5145
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 return result;
5147}
5148
Martin v. Löwis18e16552006-02-15 17:27:45 +00005149Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005150 PyObject *sub,
5151 Py_ssize_t start,
5152 Py_ssize_t end,
5153 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005155 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005156
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005158 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005159 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005160 sub = PyUnicode_FromObject(sub);
5161 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005162 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005163 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 }
Tim Petersced69f82003-09-16 20:30:58 +00005165
Thomas Wouters477c8d52006-05-27 19:21:47 +00005166 if (direction > 0)
5167 result = stringlib_find_slice(
5168 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5169 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5170 start, end
5171 );
5172 else
5173 result = stringlib_rfind_slice(
5174 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5175 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5176 start, end
5177 );
5178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005180 Py_DECREF(sub);
5181
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 return result;
5183}
5184
Tim Petersced69f82003-09-16 20:30:58 +00005185static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186int tailmatch(PyUnicodeObject *self,
5187 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005188 Py_ssize_t start,
5189 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 int direction)
5191{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 if (substring->length == 0)
5193 return 1;
5194
Thomas Wouters477c8d52006-05-27 19:21:47 +00005195 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
5197 end -= substring->length;
5198 if (end < start)
5199 return 0;
5200
5201 if (direction > 0) {
5202 if (Py_UNICODE_MATCH(self, end, substring))
5203 return 1;
5204 } else {
5205 if (Py_UNICODE_MATCH(self, start, substring))
5206 return 1;
5207 }
5208
5209 return 0;
5210}
5211
Martin v. Löwis18e16552006-02-15 17:27:45 +00005212Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005214 Py_ssize_t start,
5215 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 int direction)
5217{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005218 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 str = PyUnicode_FromObject(str);
5221 if (str == NULL)
5222 return -1;
5223 substr = PyUnicode_FromObject(substr);
5224 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005225 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 return -1;
5227 }
Tim Petersced69f82003-09-16 20:30:58 +00005228
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 result = tailmatch((PyUnicodeObject *)str,
5230 (PyUnicodeObject *)substr,
5231 start, end, direction);
5232 Py_DECREF(str);
5233 Py_DECREF(substr);
5234 return result;
5235}
5236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237/* Apply fixfct filter to the Unicode object self and return a
5238 reference to the modified object */
5239
Tim Petersced69f82003-09-16 20:30:58 +00005240static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241PyObject *fixup(PyUnicodeObject *self,
5242 int (*fixfct)(PyUnicodeObject *s))
5243{
5244
5245 PyUnicodeObject *u;
5246
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005247 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 if (u == NULL)
5249 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005250
5251 Py_UNICODE_COPY(u->str, self->str, self->length);
5252
Tim Peters7a29bd52001-09-12 03:03:31 +00005253 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 /* fixfct should return TRUE if it modified the buffer. If
5255 FALSE, return a reference to the original buffer instead
5256 (to save space, not time) */
5257 Py_INCREF(self);
5258 Py_DECREF(u);
5259 return (PyObject*) self;
5260 }
5261 return (PyObject*) u;
5262}
5263
Tim Petersced69f82003-09-16 20:30:58 +00005264static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265int fixupper(PyUnicodeObject *self)
5266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 Py_UNICODE *s = self->str;
5269 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 while (len-- > 0) {
5272 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 ch = Py_UNICODE_TOUPPER(*s);
5275 if (ch != *s) {
5276 status = 1;
5277 *s = ch;
5278 }
5279 s++;
5280 }
5281
5282 return status;
5283}
5284
Tim Petersced69f82003-09-16 20:30:58 +00005285static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286int fixlower(PyUnicodeObject *self)
5287{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005288 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 Py_UNICODE *s = self->str;
5290 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 while (len-- > 0) {
5293 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005294
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 ch = Py_UNICODE_TOLOWER(*s);
5296 if (ch != *s) {
5297 status = 1;
5298 *s = ch;
5299 }
5300 s++;
5301 }
5302
5303 return status;
5304}
5305
Tim Petersced69f82003-09-16 20:30:58 +00005306static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307int fixswapcase(PyUnicodeObject *self)
5308{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005309 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 Py_UNICODE *s = self->str;
5311 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 while (len-- > 0) {
5314 if (Py_UNICODE_ISUPPER(*s)) {
5315 *s = Py_UNICODE_TOLOWER(*s);
5316 status = 1;
5317 } else if (Py_UNICODE_ISLOWER(*s)) {
5318 *s = Py_UNICODE_TOUPPER(*s);
5319 status = 1;
5320 }
5321 s++;
5322 }
5323
5324 return status;
5325}
5326
Tim Petersced69f82003-09-16 20:30:58 +00005327static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328int fixcapitalize(PyUnicodeObject *self)
5329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005331 Py_UNICODE *s = self->str;
5332 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005333
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005334 if (len == 0)
5335 return 0;
5336 if (Py_UNICODE_ISLOWER(*s)) {
5337 *s = Py_UNICODE_TOUPPER(*s);
5338 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005340 s++;
5341 while (--len > 0) {
5342 if (Py_UNICODE_ISUPPER(*s)) {
5343 *s = Py_UNICODE_TOLOWER(*s);
5344 status = 1;
5345 }
5346 s++;
5347 }
5348 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349}
5350
5351static
5352int fixtitle(PyUnicodeObject *self)
5353{
5354 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5355 register Py_UNICODE *e;
5356 int previous_is_cased;
5357
5358 /* Shortcut for single character strings */
5359 if (PyUnicode_GET_SIZE(self) == 1) {
5360 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5361 if (*p != ch) {
5362 *p = ch;
5363 return 1;
5364 }
5365 else
5366 return 0;
5367 }
Tim Petersced69f82003-09-16 20:30:58 +00005368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 e = p + PyUnicode_GET_SIZE(self);
5370 previous_is_cased = 0;
5371 for (; p < e; p++) {
5372 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 if (previous_is_cased)
5375 *p = Py_UNICODE_TOLOWER(ch);
5376 else
5377 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005378
5379 if (Py_UNICODE_ISLOWER(ch) ||
5380 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 Py_UNICODE_ISTITLE(ch))
5382 previous_is_cased = 1;
5383 else
5384 previous_is_cased = 0;
5385 }
5386 return 1;
5387}
5388
Tim Peters8ce9f162004-08-27 01:49:32 +00005389PyObject *
5390PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391{
Tim Peters8ce9f162004-08-27 01:49:32 +00005392 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005393 const Py_UNICODE blank = ' ';
5394 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005395 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005396 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005397 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5398 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005399 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5400 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005402 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005403 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Tim Peters05eba1f2004-08-27 21:32:02 +00005405 fseq = PySequence_Fast(seq, "");
5406 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005407 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005408 }
5409
Tim Peters91879ab2004-08-27 22:35:44 +00005410 /* Grrrr. A codec may be invoked to convert str objects to
5411 * Unicode, and so it's possible to call back into Python code
5412 * during PyUnicode_FromObject(), and so it's possible for a sick
5413 * codec to change the size of fseq (if seq is a list). Therefore
5414 * we have to keep refetching the size -- can't assume seqlen
5415 * is invariant.
5416 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005417 seqlen = PySequence_Fast_GET_SIZE(fseq);
5418 /* If empty sequence, return u"". */
5419 if (seqlen == 0) {
5420 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5421 goto Done;
5422 }
5423 /* If singleton sequence with an exact Unicode, return that. */
5424 if (seqlen == 1) {
5425 item = PySequence_Fast_GET_ITEM(fseq, 0);
5426 if (PyUnicode_CheckExact(item)) {
5427 Py_INCREF(item);
5428 res = (PyUnicodeObject *)item;
5429 goto Done;
5430 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005431 }
5432
Tim Peters05eba1f2004-08-27 21:32:02 +00005433 /* At least two items to join, or one that isn't exact Unicode. */
5434 if (seqlen > 1) {
5435 /* Set up sep and seplen -- they're needed. */
5436 if (separator == NULL) {
5437 sep = &blank;
5438 seplen = 1;
5439 }
5440 else {
5441 internal_separator = PyUnicode_FromObject(separator);
5442 if (internal_separator == NULL)
5443 goto onError;
5444 sep = PyUnicode_AS_UNICODE(internal_separator);
5445 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005446 /* In case PyUnicode_FromObject() mutated seq. */
5447 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005448 }
5449 }
5450
5451 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005452 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005453 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005454 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005455 res_p = PyUnicode_AS_UNICODE(res);
5456 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005457
Tim Peters05eba1f2004-08-27 21:32:02 +00005458 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005459 Py_ssize_t itemlen;
5460 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005461
5462 item = PySequence_Fast_GET_ITEM(fseq, i);
5463 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005464 if (!PyUnicode_Check(item)) {
5465 PyErr_Format(PyExc_TypeError,
5466 "sequence item %zd: expected str instance,"
5467 " %.80s found",
5468 i, Py_Type(item)->tp_name);
5469 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005470 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005471 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 if (item == NULL)
5473 goto onError;
5474 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005475
Tim Peters91879ab2004-08-27 22:35:44 +00005476 /* In case PyUnicode_FromObject() mutated seq. */
5477 seqlen = PySequence_Fast_GET_SIZE(fseq);
5478
Tim Peters8ce9f162004-08-27 01:49:32 +00005479 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005481 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005482 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005483 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005484 if (i < seqlen - 1) {
5485 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005486 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005487 goto Overflow;
5488 }
5489 if (new_res_used > res_alloc) {
5490 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005491 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005492 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005493 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005494 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005495 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005496 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005497 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005499 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005500 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005502
5503 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005504 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 res_p += itemlen;
5506 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005507 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 res_p += seplen;
5509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005511 res_used = new_res_used;
5512 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005513
Tim Peters05eba1f2004-08-27 21:32:02 +00005514 /* Shrink res to match the used area; this probably can't fail,
5515 * but it's cheap to check.
5516 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005517 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005518 goto onError;
5519
5520 Done:
5521 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005522 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 return (PyObject *)res;
5524
Tim Peters8ce9f162004-08-27 01:49:32 +00005525 Overflow:
5526 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005527 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005528 Py_DECREF(item);
5529 /* fall through */
5530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005532 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005533 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005534 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 return NULL;
5536}
5537
Tim Petersced69f82003-09-16 20:30:58 +00005538static
5539PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005540 Py_ssize_t left,
5541 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 Py_UNICODE fill)
5543{
5544 PyUnicodeObject *u;
5545
5546 if (left < 0)
5547 left = 0;
5548 if (right < 0)
5549 right = 0;
5550
Tim Peters7a29bd52001-09-12 03:03:31 +00005551 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 Py_INCREF(self);
5553 return self;
5554 }
5555
5556 u = _PyUnicode_New(left + self->length + right);
5557 if (u) {
5558 if (left)
5559 Py_UNICODE_FILL(u->str, fill, left);
5560 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5561 if (right)
5562 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5563 }
5564
5565 return u;
5566}
5567
/* Append the slice data[left:right] to `list` as a new Unicode object.
 *
 * Relies on the enclosing function providing:
 *   - a `PyObject *str` scratch variable,
 *   - a `PyObject *list` to append to,
 *   - an `onError` label, jumped to when creating or appending fails.
 *
 * Wrapped in do { } while (0) so that it expands to exactly one
 * statement; invoke it with a trailing semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5578
5579static
5580PyObject *split_whitespace(PyUnicodeObject *self,
5581 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005582 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005584 register Py_ssize_t i;
5585 register Py_ssize_t j;
5586 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 PyObject *str;
5588
5589 for (i = j = 0; i < len; ) {
5590 /* find a token */
5591 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5592 i++;
5593 j = i;
5594 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5595 i++;
5596 if (j < i) {
5597 if (maxcount-- <= 0)
5598 break;
5599 SPLIT_APPEND(self->str, j, i);
5600 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5601 i++;
5602 j = i;
5603 }
5604 }
5605 if (j < len) {
5606 SPLIT_APPEND(self->str, j, len);
5607 }
5608 return list;
5609
5610 onError:
5611 Py_DECREF(list);
5612 return NULL;
5613}
5614
5615PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005616 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005618 register Py_ssize_t i;
5619 register Py_ssize_t j;
5620 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 PyObject *list;
5622 PyObject *str;
5623 Py_UNICODE *data;
5624
5625 string = PyUnicode_FromObject(string);
5626 if (string == NULL)
5627 return NULL;
5628 data = PyUnicode_AS_UNICODE(string);
5629 len = PyUnicode_GET_SIZE(string);
5630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 list = PyList_New(0);
5632 if (!list)
5633 goto onError;
5634
5635 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005637
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005639 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
5642 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005643 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 if (i < len) {
5645 if (data[i] == '\r' && i + 1 < len &&
5646 data[i+1] == '\n')
5647 i += 2;
5648 else
5649 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005650 if (keepends)
5651 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 }
Guido van Rossum86662912000-04-11 15:38:46 +00005653 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 j = i;
5655 }
5656 if (j < len) {
5657 SPLIT_APPEND(data, j, len);
5658 }
5659
5660 Py_DECREF(string);
5661 return list;
5662
5663 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005664 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 Py_DECREF(string);
5666 return NULL;
5667}
5668
Tim Petersced69f82003-09-16 20:30:58 +00005669static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670PyObject *split_char(PyUnicodeObject *self,
5671 PyObject *list,
5672 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005673 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005675 register Py_ssize_t i;
5676 register Py_ssize_t j;
5677 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 PyObject *str;
5679
5680 for (i = j = 0; i < len; ) {
5681 if (self->str[i] == ch) {
5682 if (maxcount-- <= 0)
5683 break;
5684 SPLIT_APPEND(self->str, j, i);
5685 i = j = i + 1;
5686 } else
5687 i++;
5688 }
5689 if (j <= len) {
5690 SPLIT_APPEND(self->str, j, len);
5691 }
5692 return list;
5693
5694 onError:
5695 Py_DECREF(list);
5696 return NULL;
5697}
5698
Tim Petersced69f82003-09-16 20:30:58 +00005699static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700PyObject *split_substring(PyUnicodeObject *self,
5701 PyObject *list,
5702 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005705 register Py_ssize_t i;
5706 register Py_ssize_t j;
5707 Py_ssize_t len = self->length;
5708 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 PyObject *str;
5710
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005711 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 if (Py_UNICODE_MATCH(self, i, substring)) {
5713 if (maxcount-- <= 0)
5714 break;
5715 SPLIT_APPEND(self->str, j, i);
5716 i = j = i + sublen;
5717 } else
5718 i++;
5719 }
5720 if (j <= len) {
5721 SPLIT_APPEND(self->str, j, len);
5722 }
5723 return list;
5724
5725 onError:
5726 Py_DECREF(list);
5727 return NULL;
5728}
5729
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005730static
5731PyObject *rsplit_whitespace(PyUnicodeObject *self,
5732 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005733 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005734{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005735 register Py_ssize_t i;
5736 register Py_ssize_t j;
5737 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005738 PyObject *str;
5739
5740 for (i = j = len - 1; i >= 0; ) {
5741 /* find a token */
5742 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5743 i--;
5744 j = i;
5745 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5746 i--;
5747 if (j > i) {
5748 if (maxcount-- <= 0)
5749 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005750 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005751 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5752 i--;
5753 j = i;
5754 }
5755 }
5756 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005757 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005758 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005759 if (PyList_Reverse(list) < 0)
5760 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005761 return list;
5762
5763 onError:
5764 Py_DECREF(list);
5765 return NULL;
5766}
5767
5768static
5769PyObject *rsplit_char(PyUnicodeObject *self,
5770 PyObject *list,
5771 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005772 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005773{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005774 register Py_ssize_t i;
5775 register Py_ssize_t j;
5776 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777 PyObject *str;
5778
5779 for (i = j = len - 1; i >= 0; ) {
5780 if (self->str[i] == ch) {
5781 if (maxcount-- <= 0)
5782 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005783 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005784 j = i = i - 1;
5785 } else
5786 i--;
5787 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005788 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005789 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005791 if (PyList_Reverse(list) < 0)
5792 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005793 return list;
5794
5795 onError:
5796 Py_DECREF(list);
5797 return NULL;
5798}
5799
5800static
5801PyObject *rsplit_substring(PyUnicodeObject *self,
5802 PyObject *list,
5803 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005804 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005805{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005806 register Py_ssize_t i;
5807 register Py_ssize_t j;
5808 Py_ssize_t len = self->length;
5809 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005810 PyObject *str;
5811
5812 for (i = len - sublen, j = len; i >= 0; ) {
5813 if (Py_UNICODE_MATCH(self, i, substring)) {
5814 if (maxcount-- <= 0)
5815 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005816 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817 j = i;
5818 i -= sublen;
5819 } else
5820 i--;
5821 }
5822 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005823 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005824 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 if (PyList_Reverse(list) < 0)
5826 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005827 return list;
5828
5829 onError:
5830 Py_DECREF(list);
5831 return NULL;
5832}
5833
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834#undef SPLIT_APPEND
5835
5836static
5837PyObject *split(PyUnicodeObject *self,
5838 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005839 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840{
5841 PyObject *list;
5842
5843 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005844 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845
5846 list = PyList_New(0);
5847 if (!list)
5848 return NULL;
5849
5850 if (substring == NULL)
5851 return split_whitespace(self,list,maxcount);
5852
5853 else if (substring->length == 1)
5854 return split_char(self,list,substring->str[0],maxcount);
5855
5856 else if (substring->length == 0) {
5857 Py_DECREF(list);
5858 PyErr_SetString(PyExc_ValueError, "empty separator");
5859 return NULL;
5860 }
5861 else
5862 return split_substring(self,list,substring,maxcount);
5863}
5864
Tim Petersced69f82003-09-16 20:30:58 +00005865static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866PyObject *rsplit(PyUnicodeObject *self,
5867 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005869{
5870 PyObject *list;
5871
5872 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005873 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005874
5875 list = PyList_New(0);
5876 if (!list)
5877 return NULL;
5878
5879 if (substring == NULL)
5880 return rsplit_whitespace(self,list,maxcount);
5881
5882 else if (substring->length == 1)
5883 return rsplit_char(self,list,substring->str[0],maxcount);
5884
5885 else if (substring->length == 0) {
5886 Py_DECREF(list);
5887 PyErr_SetString(PyExc_ValueError, "empty separator");
5888 return NULL;
5889 }
5890 else
5891 return rsplit_substring(self,list,substring,maxcount);
5892}
5893
5894static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895PyObject *replace(PyUnicodeObject *self,
5896 PyUnicodeObject *str1,
5897 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005898 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899{
5900 PyUnicodeObject *u;
5901
5902 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005903 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904
Thomas Wouters477c8d52006-05-27 19:21:47 +00005905 if (str1->length == str2->length) {
5906 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005907 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005908 if (str1->length == 1) {
5909 /* replace characters */
5910 Py_UNICODE u1, u2;
5911 if (!findchar(self->str, self->length, str1->str[0]))
5912 goto nothing;
5913 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5914 if (!u)
5915 return NULL;
5916 Py_UNICODE_COPY(u->str, self->str, self->length);
5917 u1 = str1->str[0];
5918 u2 = str2->str[0];
5919 for (i = 0; i < u->length; i++)
5920 if (u->str[i] == u1) {
5921 if (--maxcount < 0)
5922 break;
5923 u->str[i] = u2;
5924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005926 i = fastsearch(
5927 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005929 if (i < 0)
5930 goto nothing;
5931 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5932 if (!u)
5933 return NULL;
5934 Py_UNICODE_COPY(u->str, self->str, self->length);
5935 while (i <= self->length - str1->length)
5936 if (Py_UNICODE_MATCH(self, i, str1)) {
5937 if (--maxcount < 0)
5938 break;
5939 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5940 i += str1->length;
5941 } else
5942 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005945
5946 Py_ssize_t n, i, j, e;
5947 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 Py_UNICODE *p;
5949
5950 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005951 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 if (n > maxcount)
5953 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954 if (n == 0)
5955 goto nothing;
5956 /* new_size = self->length + n * (str2->length - str1->length)); */
5957 delta = (str2->length - str1->length);
5958 if (delta == 0) {
5959 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005961 product = n * (str2->length - str1->length);
5962 if ((product / (str2->length - str1->length)) != n) {
5963 PyErr_SetString(PyExc_OverflowError,
5964 "replace string is too long");
5965 return NULL;
5966 }
5967 new_size = self->length + product;
5968 if (new_size < 0) {
5969 PyErr_SetString(PyExc_OverflowError,
5970 "replace string is too long");
5971 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 }
5973 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005974 u = _PyUnicode_New(new_size);
5975 if (!u)
5976 return NULL;
5977 i = 0;
5978 p = u->str;
5979 e = self->length - str1->length;
5980 if (str1->length > 0) {
5981 while (n-- > 0) {
5982 /* look for next match */
5983 j = i;
5984 while (j <= e) {
5985 if (Py_UNICODE_MATCH(self, j, str1))
5986 break;
5987 j++;
5988 }
5989 if (j > i) {
5990 if (j > e)
5991 break;
5992 /* copy unchanged part [i:j] */
5993 Py_UNICODE_COPY(p, self->str+i, j-i);
5994 p += j - i;
5995 }
5996 /* copy substitution string */
5997 if (str2->length > 0) {
5998 Py_UNICODE_COPY(p, str2->str, str2->length);
5999 p += str2->length;
6000 }
6001 i = j + str1->length;
6002 }
6003 if (i < self->length)
6004 /* copy tail [i:] */
6005 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6006 } else {
6007 /* interleave */
6008 while (n > 0) {
6009 Py_UNICODE_COPY(p, str2->str, str2->length);
6010 p += str2->length;
6011 if (--n <= 0)
6012 break;
6013 *p++ = self->str[i++];
6014 }
6015 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006019
6020nothing:
6021 /* nothing to replace; return original string (when possible) */
6022 if (PyUnicode_CheckExact(self)) {
6023 Py_INCREF(self);
6024 return (PyObject *) self;
6025 }
6026 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027}
6028
6029/* --- Unicode Object Methods --------------------------------------------- */
6030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006031PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032"S.title() -> unicode\n\
6033\n\
6034Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006035characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036
6037static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006038unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 return fixup(self, fixtitle);
6041}
6042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006043PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044"S.capitalize() -> unicode\n\
6045\n\
6046Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006047have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
6049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006050unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 return fixup(self, fixcapitalize);
6053}
6054
#if 0
/* Disabled: kept for reference only, not compiled. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word via fixup(), and join
   the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *result;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word, replacing the list slots in place */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                                fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(list);
            return NULL;
        }
        /* release the old word before the slot takes over `fixed` */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, list);
    Py_DECREF(list);
    return (PyObject *)result;
}
#endif
6092
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006093/* Argument converter. Coerces to a single unicode character */
6094
6095static int
6096convert_uc(PyObject *obj, void *addr)
6097{
6098 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6099 PyObject *uniobj;
6100 Py_UNICODE *unistr;
6101
6102 uniobj = PyUnicode_FromObject(obj);
6103 if (uniobj == NULL) {
6104 PyErr_SetString(PyExc_TypeError,
6105 "The fill character cannot be converted to Unicode");
6106 return 0;
6107 }
6108 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6109 PyErr_SetString(PyExc_TypeError,
6110 "The fill character must be exactly one character long");
6111 Py_DECREF(uniobj);
6112 return 0;
6113 }
6114 unistr = PyUnicode_AS_UNICODE(uniobj);
6115 *fillcharloc = unistr[0];
6116 Py_DECREF(uniobj);
6117 return 1;
6118}
6119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006120PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006121"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006123Return S centered in a Unicode string of length width. Padding is\n\
6124done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125
6126static PyObject *
6127unicode_center(PyUnicodeObject *self, PyObject *args)
6128{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t marg, left;
6130 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006131 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
Thomas Woutersde017742006-02-16 19:34:37 +00006133 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 return NULL;
6135
Tim Peters7a29bd52001-09-12 03:03:31 +00006136 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 Py_INCREF(self);
6138 return (PyObject*) self;
6139 }
6140
6141 marg = width - self->length;
6142 left = marg / 2 + (marg & width & 1);
6143
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006144 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145}
6146
Marc-André Lemburge5034372000-08-08 08:04:29 +00006147#if 0
6148
6149/* This code should go into some future Unicode collation support
6150 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006151 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006152
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006153/* speedy UTF-16 code point order comparison */
6154/* gleaned from: */
6155/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6156
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006157static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006158{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006159 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006160 0, 0, 0, 0, 0, 0, 0, 0,
6161 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006162 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006163};
6164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165static int
6166unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006169
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 Py_UNICODE *s1 = str1->str;
6171 Py_UNICODE *s2 = str2->str;
6172
6173 len1 = str1->length;
6174 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006175
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006177 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006178
6179 c1 = *s1++;
6180 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006181
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006182 if (c1 > (1<<11) * 26)
6183 c1 += utf16Fixup[c1>>11];
6184 if (c2 > (1<<11) * 26)
6185 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006186 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006187
6188 if (c1 != c2)
6189 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006190
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006191 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 }
6193
6194 return (len1 < len2) ? -1 : (len1 != len2);
6195}
6196
Marc-André Lemburge5034372000-08-08 08:04:29 +00006197#else
6198
6199static int
6200unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6201{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006202 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006203
6204 Py_UNICODE *s1 = str1->str;
6205 Py_UNICODE *s2 = str2->str;
6206
6207 len1 = str1->length;
6208 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006209
Marc-André Lemburge5034372000-08-08 08:04:29 +00006210 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006211 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006212
Fredrik Lundh45714e92001-06-26 16:39:36 +00006213 c1 = *s1++;
6214 c2 = *s2++;
6215
6216 if (c1 != c2)
6217 return (c1 < c2) ? -1 : 1;
6218
Marc-André Lemburge5034372000-08-08 08:04:29 +00006219 len1--; len2--;
6220 }
6221
6222 return (len1 < len2) ? -1 : (len1 != len2);
6223}
6224
6225#endif
6226
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227int PyUnicode_Compare(PyObject *left,
6228 PyObject *right)
6229{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006230 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6231 return unicode_compare((PyUnicodeObject *)left,
6232 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006233 PyErr_Format(PyExc_TypeError,
6234 "Can't compare %.100s and %.100s",
6235 left->ob_type->tp_name,
6236 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 return -1;
6238}
6239
Martin v. Löwis5b222132007-06-10 09:51:05 +00006240int
6241PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6242{
6243 int i;
6244 Py_UNICODE *id;
6245 assert(PyUnicode_Check(uni));
6246 id = PyUnicode_AS_UNICODE(uni);
6247 /* Compare Unicode string and source character set string */
6248 for (i = 0; id[i] && str[i]; i++)
6249 if (id[i] != str[i])
6250 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6251 if (id[i])
6252 return 1; /* uni is longer */
6253 if (str[i])
6254 return -1; /* str is longer */
6255 return 0;
6256}
6257
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006258PyObject *PyUnicode_RichCompare(PyObject *left,
6259 PyObject *right,
6260 int op)
6261{
6262 int result;
6263
6264 result = PyUnicode_Compare(left, right);
6265 if (result == -1 && PyErr_Occurred())
6266 goto onError;
6267
6268 /* Convert the return value to a Boolean */
6269 switch (op) {
6270 case Py_EQ:
6271 result = (result == 0);
6272 break;
6273 case Py_NE:
6274 result = (result != 0);
6275 break;
6276 case Py_LE:
6277 result = (result <= 0);
6278 break;
6279 case Py_GE:
6280 result = (result >= 0);
6281 break;
6282 case Py_LT:
6283 result = (result == -1);
6284 break;
6285 case Py_GT:
6286 result = (result == 1);
6287 break;
6288 }
6289 return PyBool_FromLong(result);
6290
6291 onError:
6292
6293 /* Standard case
6294
6295 Type errors mean that PyUnicode_FromObject() could not convert
6296 one of the arguments (usually the right hand side) to Unicode,
6297 ie. we can't handle the comparison request. However, it is
6298 possible that the other object knows a comparison method, which
6299 is why we return Py_NotImplemented to give the other object a
6300 chance.
6301
6302 */
6303 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6304 PyErr_Clear();
6305 Py_INCREF(Py_NotImplemented);
6306 return Py_NotImplemented;
6307 }
6308 if (op != Py_EQ && op != Py_NE)
6309 return NULL;
6310
6311 /* Equality comparison.
6312
6313 This is a special case: we silence any PyExc_UnicodeDecodeError
6314 and instead turn it into a PyErr_UnicodeWarning.
6315
6316 */
6317 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6318 return NULL;
6319 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006320 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6321 (op == Py_EQ) ?
6322 "Unicode equal comparison "
6323 "failed to convert both arguments to Unicode - "
6324 "interpreting them as being unequal"
6325 :
6326 "Unicode unequal comparison "
6327 "failed to convert both arguments to Unicode - "
6328 "interpreting them as being unequal",
6329 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006330 return NULL;
6331 result = (op == Py_NE);
6332 return PyBool_FromLong(result);
6333}
6334
Guido van Rossum403d68b2000-03-13 15:55:09 +00006335int PyUnicode_Contains(PyObject *container,
6336 PyObject *element)
6337{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006338 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006339 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006340
6341 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006342 sub = PyUnicode_FromObject(element);
6343 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006344 PyErr_Format(PyExc_TypeError,
6345 "'in <string>' requires string as left operand, not %s",
6346 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006347 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006348 }
6349
Thomas Wouters477c8d52006-05-27 19:21:47 +00006350 str = PyUnicode_FromObject(container);
6351 if (!str) {
6352 Py_DECREF(sub);
6353 return -1;
6354 }
6355
6356 result = stringlib_contains_obj(str, sub);
6357
6358 Py_DECREF(str);
6359 Py_DECREF(sub);
6360
Guido van Rossum403d68b2000-03-13 15:55:09 +00006361 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006362}
6363
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364/* Concat to string or Unicode object giving a new Unicode object. */
6365
6366PyObject *PyUnicode_Concat(PyObject *left,
6367 PyObject *right)
6368{
6369 PyUnicodeObject *u = NULL, *v = NULL, *w;
6370
6371 /* Coerce the two arguments */
6372 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6373 if (u == NULL)
6374 goto onError;
6375 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6376 if (v == NULL)
6377 goto onError;
6378
6379 /* Shortcuts */
6380 if (v == unicode_empty) {
6381 Py_DECREF(v);
6382 return (PyObject *)u;
6383 }
6384 if (u == unicode_empty) {
6385 Py_DECREF(u);
6386 return (PyObject *)v;
6387 }
6388
6389 /* Concat the two Unicode strings */
6390 w = _PyUnicode_New(u->length + v->length);
6391 if (w == NULL)
6392 goto onError;
6393 Py_UNICODE_COPY(w->str, u->str, u->length);
6394 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6395
6396 Py_DECREF(u);
6397 Py_DECREF(v);
6398 return (PyObject *)w;
6399
6400onError:
6401 Py_XDECREF(u);
6402 Py_XDECREF(v);
6403 return NULL;
6404}
6405
Walter Dörwald1ab83302007-05-18 17:15:44 +00006406void
6407PyUnicode_Append(PyObject **pleft, PyObject *right)
6408{
6409 PyObject *new;
6410 if (*pleft == NULL)
6411 return;
6412 if (right == NULL || !PyUnicode_Check(*pleft)) {
6413 Py_DECREF(*pleft);
6414 *pleft = NULL;
6415 return;
6416 }
6417 new = PyUnicode_Concat(*pleft, right);
6418 Py_DECREF(*pleft);
6419 *pleft = new;
6420}
6421
6422void
6423PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6424{
6425 PyUnicode_Append(pleft, right);
6426 Py_XDECREF(right);
6427}
6428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006429PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430"S.count(sub[, start[, end]]) -> int\n\
6431\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006432Return the number of non-overlapping occurrences of substring sub in\n\
6433Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006434interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
6436static PyObject *
6437unicode_count(PyUnicodeObject *self, PyObject *args)
6438{
6439 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006440 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006441 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 PyObject *result;
6443
Guido van Rossumb8872e62000-05-09 14:14:27 +00006444 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6445 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 return NULL;
6447
6448 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 if (substring == NULL)
6451 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006452
Thomas Wouters477c8d52006-05-27 19:21:47 +00006453 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454
Thomas Wouters477c8d52006-05-27 19:21:47 +00006455 result = PyInt_FromSsize_t(
6456 stringlib_count(self->str + start, end - start,
6457 substring->str, substring->length)
6458 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
6460 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006461
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 return result;
6463}
6464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006465PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006466"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006468Encodes S using the codec registered for encoding. encoding defaults\n\
6469to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006470handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6472'xmlcharrefreplace' as well as any other name registered with\n\
6473codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
6475static PyObject *
6476unicode_encode(PyUnicodeObject *self, PyObject *args)
6477{
6478 char *encoding = NULL;
6479 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006480 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006481
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6483 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006484 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006485 if (v == NULL)
6486 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006487 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006488 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006489 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006490 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006491 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006492 Py_DECREF(v);
6493 return NULL;
6494 }
6495 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006496
6497 onError:
6498 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006499}
6500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006501PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502"S.expandtabs([tabsize]) -> unicode\n\
6503\n\
6504Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006505If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
6507static PyObject*
6508unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6509{
6510 Py_UNICODE *e;
6511 Py_UNICODE *p;
6512 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006513 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 PyUnicodeObject *u;
6515 int tabsize = 8;
6516
6517 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6518 return NULL;
6519
Thomas Wouters7e474022000-07-16 12:04:32 +00006520 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006521 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 e = self->str + self->length;
6523 for (p = self->str; p < e; p++)
6524 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006525 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006527 if (old_j > j) {
6528 PyErr_SetString(PyExc_OverflowError,
6529 "new string is too long");
6530 return NULL;
6531 }
6532 old_j = j;
6533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 }
6535 else {
6536 j++;
6537 if (*p == '\n' || *p == '\r') {
6538 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006539 old_j = j = 0;
6540 if (i < 0) {
6541 PyErr_SetString(PyExc_OverflowError,
6542 "new string is too long");
6543 return NULL;
6544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 }
6546 }
6547
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006548 if ((i + j) < 0) {
6549 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6550 return NULL;
6551 }
6552
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 /* Second pass: create output string and fill it */
6554 u = _PyUnicode_New(i + j);
6555 if (!u)
6556 return NULL;
6557
6558 j = 0;
6559 q = u->str;
6560
6561 for (p = self->str; p < e; p++)
6562 if (*p == '\t') {
6563 if (tabsize > 0) {
6564 i = tabsize - (j % tabsize);
6565 j += i;
6566 while (i--)
6567 *q++ = ' ';
6568 }
6569 }
6570 else {
6571 j++;
6572 *q++ = *p;
6573 if (*p == '\n' || *p == '\r')
6574 j = 0;
6575 }
6576
6577 return (PyObject*) u;
6578}
6579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581"S.find(sub [,start [,end]]) -> int\n\
6582\n\
6583Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006584such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585arguments start and end are interpreted as in slice notation.\n\
6586\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006587Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588
6589static PyObject *
6590unicode_find(PyUnicodeObject *self, PyObject *args)
6591{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006593 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006594 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006595 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
Guido van Rossumb8872e62000-05-09 14:14:27 +00006597 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6598 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006600 substring = PyUnicode_FromObject(substring);
6601 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 return NULL;
6603
Thomas Wouters477c8d52006-05-27 19:21:47 +00006604 result = stringlib_find_slice(
6605 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6606 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6607 start, end
6608 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
6610 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006611
6612 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613}
6614
6615static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006616unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617{
6618 if (index < 0 || index >= self->length) {
6619 PyErr_SetString(PyExc_IndexError, "string index out of range");
6620 return NULL;
6621 }
6622
6623 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6624}
6625
Guido van Rossumc2504932007-09-18 19:42:40 +00006626/* Believe it or not, this produces the same value for ASCII strings
6627 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006629unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630{
Guido van Rossumc2504932007-09-18 19:42:40 +00006631 Py_ssize_t len;
6632 Py_UNICODE *p;
6633 long x;
6634
6635 if (self->hash != -1)
6636 return self->hash;
6637 len = Py_Size(self);
6638 p = self->str;
6639 x = *p << 7;
6640 while (--len >= 0)
6641 x = (1000003*x) ^ *p++;
6642 x ^= Py_Size(self);
6643 if (x == -1)
6644 x = -2;
6645 self->hash = x;
6646 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650"S.index(sub [,start [,end]]) -> int\n\
6651\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006652Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653
6654static PyObject *
6655unicode_index(PyUnicodeObject *self, PyObject *args)
6656{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006657 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006658 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006659 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006660 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661
Guido van Rossumb8872e62000-05-09 14:14:27 +00006662 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6663 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006665 substring = PyUnicode_FromObject(substring);
6666 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 return NULL;
6668
Thomas Wouters477c8d52006-05-27 19:21:47 +00006669 result = stringlib_find_slice(
6670 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6671 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6672 start, end
6673 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
6675 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 if (result < 0) {
6678 PyErr_SetString(PyExc_ValueError, "substring not found");
6679 return NULL;
6680 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006681
Martin v. Löwis18e16552006-02-15 17:27:45 +00006682 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683}
6684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006685PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006686"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006688Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006689at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
6691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006692unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693{
6694 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6695 register const Py_UNICODE *e;
6696 int cased;
6697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 /* Shortcut for single character strings */
6699 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006700 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006702 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006703 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006704 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006705
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 e = p + PyUnicode_GET_SIZE(self);
6707 cased = 0;
6708 for (; p < e; p++) {
6709 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006710
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 else if (!cased && Py_UNICODE_ISLOWER(ch))
6714 cased = 1;
6715 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006716 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006720"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006722Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006723at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724
6725static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006726unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
6728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6729 register const Py_UNICODE *e;
6730 int cased;
6731
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 /* Shortcut for single character strings */
6733 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006734 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006736 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006737 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006738 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 e = p + PyUnicode_GET_SIZE(self);
6741 cased = 0;
6742 for (; p < e; p++) {
6743 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006744
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006746 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 else if (!cased && Py_UNICODE_ISUPPER(ch))
6748 cased = 1;
6749 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006750 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751}
6752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006753PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006754"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006756Return True if S is a titlecased string and there is at least one\n\
6757character in S, i.e. upper- and titlecase characters may only\n\
6758follow uncased characters and lowercase characters only cased ones.\n\
6759Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760
6761static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006762unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763{
6764 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6765 register const Py_UNICODE *e;
6766 int cased, previous_is_cased;
6767
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 /* Shortcut for single character strings */
6769 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006770 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6771 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006773 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006774 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006775 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 e = p + PyUnicode_GET_SIZE(self);
6778 cased = 0;
6779 previous_is_cased = 0;
6780 for (; p < e; p++) {
6781 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006782
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6784 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006785 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 previous_is_cased = 1;
6787 cased = 1;
6788 }
6789 else if (Py_UNICODE_ISLOWER(ch)) {
6790 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 previous_is_cased = 1;
6793 cased = 1;
6794 }
6795 else
6796 previous_is_cased = 0;
6797 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006798 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006801PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006802"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006804Return True if all characters in S are whitespace\n\
6805and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
6807static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006808unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
6810 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6811 register const Py_UNICODE *e;
6812
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 /* Shortcut for single character strings */
6814 if (PyUnicode_GET_SIZE(self) == 1 &&
6815 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006816 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006818 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006819 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006820 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006821
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 e = p + PyUnicode_GET_SIZE(self);
6823 for (; p < e; p++) {
6824 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006827 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828}
6829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006830PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006831"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006832\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006833Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006834and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006835
6836static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006837unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006838{
6839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6840 register const Py_UNICODE *e;
6841
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006842 /* Shortcut for single character strings */
6843 if (PyUnicode_GET_SIZE(self) == 1 &&
6844 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006845 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006846
6847 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006848 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006849 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006850
6851 e = p + PyUnicode_GET_SIZE(self);
6852 for (; p < e; p++) {
6853 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006855 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006856 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006857}
6858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006859PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006860"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006861\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006862Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006863and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864
6865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006866unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006867{
6868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6869 register const Py_UNICODE *e;
6870
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871 /* Shortcut for single character strings */
6872 if (PyUnicode_GET_SIZE(self) == 1 &&
6873 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006874 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006875
6876 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006877 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879
6880 e = p + PyUnicode_GET_SIZE(self);
6881 for (; p < e; p++) {
6882 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006885 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006886}
6887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006888PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006889"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006892False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
6894static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006895unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896{
6897 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6898 register const Py_UNICODE *e;
6899
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 /* Shortcut for single character strings */
6901 if (PyUnicode_GET_SIZE(self) == 1 &&
6902 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006903 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006905 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006906 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 e = p + PyUnicode_GET_SIZE(self);
6910 for (; p < e; p++) {
6911 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006912 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006914 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915}
6916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006917PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006918"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006920Return True if all characters in S are digits\n\
6921and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922
6923static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006924unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925{
6926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6927 register const Py_UNICODE *e;
6928
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 /* Shortcut for single character strings */
6930 if (PyUnicode_GET_SIZE(self) == 1 &&
6931 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006932 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006934 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006935 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 e = p + PyUnicode_GET_SIZE(self);
6939 for (; p < e; p++) {
6940 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006943 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944}
6945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006946PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006947"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006949Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006950False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
6952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006953unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954{
6955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6956 register const Py_UNICODE *e;
6957
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 /* Shortcut for single character strings */
6959 if (PyUnicode_GET_SIZE(self) == 1 &&
6960 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006961 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006963 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006964 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006965 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006966
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 e = p + PyUnicode_GET_SIZE(self);
6968 for (; p < e; p++) {
6969 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006972 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973}
6974
Martin v. Löwis47383402007-08-15 07:32:56 +00006975int
6976PyUnicode_IsIdentifier(PyObject *self)
6977{
6978 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6979 register const Py_UNICODE *e;
6980
6981 /* Special case for empty strings */
6982 if (PyUnicode_GET_SIZE(self) == 0)
6983 return 0;
6984
6985 /* PEP 3131 says that the first character must be in
6986 XID_Start and subsequent characters in XID_Continue,
6987 and for the ASCII range, the 2.x rules apply (i.e
6988 start with letters and underscore, continue with
6989 letters, digits, underscore). However, given the current
6990 definition of XID_Start and XID_Continue, it is sufficient
6991 to check just for these, except that _ must be allowed
6992 as starting an identifier. */
6993 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6994 return 0;
6995
6996 e = p + PyUnicode_GET_SIZE(self);
6997 for (p++; p < e; p++) {
6998 if (!_PyUnicode_IsXidContinue(*p))
6999 return 0;
7000 }
7001 return 1;
7002}
7003
7004PyDoc_STRVAR(isidentifier__doc__,
7005"S.isidentifier() -> bool\n\
7006\n\
7007Return True if S is a valid identifier according\n\
7008to the language definition.");
7009
7010static PyObject*
7011unicode_isidentifier(PyObject *self)
7012{
7013 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7014}
7015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007016PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017"S.join(sequence) -> unicode\n\
7018\n\
7019Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007020sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021
7022static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007023unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007025 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026}
7027
Martin v. Löwis18e16552006-02-15 17:27:45 +00007028static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029unicode_length(PyUnicodeObject *self)
7030{
7031 return self->length;
7032}
7033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007034PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007035"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036\n\
7037Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007038done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039
7040static PyObject *
7041unicode_ljust(PyUnicodeObject *self, PyObject *args)
7042{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007043 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007044 Py_UNICODE fillchar = ' ';
7045
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007046 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 return NULL;
7048
Tim Peters7a29bd52001-09-12 03:03:31 +00007049 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 Py_INCREF(self);
7051 return (PyObject*) self;
7052 }
7053
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007054 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055}
7056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007057PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058"S.lower() -> unicode\n\
7059\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007060Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061
7062static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007063unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 return fixup(self, fixlower);
7066}
7067
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format for each strip mode, indexed by the above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
7076
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007077/* externally visible for str.strip(unicode) */
7078PyObject *
7079_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7080{
7081 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007082 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007084 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7085 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086
Thomas Wouters477c8d52006-05-27 19:21:47 +00007087 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7088
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089 i = 0;
7090 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007091 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7092 i++;
7093 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094 }
7095
7096 j = len;
7097 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007098 do {
7099 j--;
7100 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7101 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007102 }
7103
7104 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007105 Py_INCREF(self);
7106 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007107 }
7108 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007109 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110}
7111
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112
7113static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007117 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118
7119 i = 0;
7120 if (striptype != RIGHTSTRIP) {
7121 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7122 i++;
7123 }
7124 }
7125
7126 j = len;
7127 if (striptype != LEFTSTRIP) {
7128 do {
7129 j--;
7130 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7131 j++;
7132 }
7133
7134 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7135 Py_INCREF(self);
7136 return (PyObject*)self;
7137 }
7138 else
7139 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140}
7141
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007142
7143static PyObject *
7144do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7145{
7146 PyObject *sep = NULL;
7147
7148 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7149 return NULL;
7150
7151 if (sep != NULL && sep != Py_None) {
7152 if (PyUnicode_Check(sep))
7153 return _PyUnicode_XStrip(self, striptype, sep);
7154 else if (PyString_Check(sep)) {
7155 PyObject *res;
7156 sep = PyUnicode_FromObject(sep);
7157 if (sep==NULL)
7158 return NULL;
7159 res = _PyUnicode_XStrip(self, striptype, sep);
7160 Py_DECREF(sep);
7161 return res;
7162 }
7163 else {
7164 PyErr_Format(PyExc_TypeError,
7165 "%s arg must be None, unicode or str",
7166 STRIPNAME(striptype));
7167 return NULL;
7168 }
7169 }
7170
7171 return do_strip(self, striptype);
7172}
7173
7174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007175PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007176"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007177\n\
7178Return a copy of the string S with leading and trailing\n\
7179whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007180If chars is given and not None, remove characters in chars instead.\n\
7181If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182
7183static PyObject *
7184unicode_strip(PyUnicodeObject *self, PyObject *args)
7185{
7186 if (PyTuple_GET_SIZE(args) == 0)
7187 return do_strip(self, BOTHSTRIP); /* Common case */
7188 else
7189 return do_argstrip(self, BOTHSTRIP, args);
7190}
7191
7192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007194"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007195\n\
7196Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007197If chars is given and not None, remove characters in chars instead.\n\
7198If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199
7200static PyObject *
7201unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7202{
7203 if (PyTuple_GET_SIZE(args) == 0)
7204 return do_strip(self, LEFTSTRIP); /* Common case */
7205 else
7206 return do_argstrip(self, LEFTSTRIP, args);
7207}
7208
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007211"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007212\n\
7213Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007214If chars is given and not None, remove characters in chars instead.\n\
7215If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007216
7217static PyObject *
7218unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7219{
7220 if (PyTuple_GET_SIZE(args) == 0)
7221 return do_strip(self, RIGHTSTRIP); /* Common case */
7222 else
7223 return do_argstrip(self, RIGHTSTRIP, args);
7224}
7225
7226
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007228unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229{
7230 PyUnicodeObject *u;
7231 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007232 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007233 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234
7235 if (len < 0)
7236 len = 0;
7237
Tim Peters7a29bd52001-09-12 03:03:31 +00007238 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 /* no repeat, return original string */
7240 Py_INCREF(str);
7241 return (PyObject*) str;
7242 }
Tim Peters8f422462000-09-09 06:13:41 +00007243
7244 /* ensure # of chars needed doesn't overflow int and # of bytes
7245 * needed doesn't overflow size_t
7246 */
7247 nchars = len * str->length;
7248 if (len && nchars / len != str->length) {
7249 PyErr_SetString(PyExc_OverflowError,
7250 "repeated string is too long");
7251 return NULL;
7252 }
7253 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7254 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7255 PyErr_SetString(PyExc_OverflowError,
7256 "repeated string is too long");
7257 return NULL;
7258 }
7259 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 if (!u)
7261 return NULL;
7262
7263 p = u->str;
7264
Thomas Wouters477c8d52006-05-27 19:21:47 +00007265 if (str->length == 1 && len > 0) {
7266 Py_UNICODE_FILL(p, str->str[0], len);
7267 } else {
7268 Py_ssize_t done = 0; /* number of characters copied this far */
7269 if (done < nchars) {
7270 Py_UNICODE_COPY(p, str->str, str->length);
7271 done = str->length;
7272 }
7273 while (done < nchars) {
7274 int n = (done <= nchars-done) ? done : nchars-done;
7275 Py_UNICODE_COPY(p+done, p, n);
7276 done += n;
7277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 }
7279
7280 return (PyObject*) u;
7281}
7282
7283PyObject *PyUnicode_Replace(PyObject *obj,
7284 PyObject *subobj,
7285 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287{
7288 PyObject *self;
7289 PyObject *str1;
7290 PyObject *str2;
7291 PyObject *result;
7292
7293 self = PyUnicode_FromObject(obj);
7294 if (self == NULL)
7295 return NULL;
7296 str1 = PyUnicode_FromObject(subobj);
7297 if (str1 == NULL) {
7298 Py_DECREF(self);
7299 return NULL;
7300 }
7301 str2 = PyUnicode_FromObject(replobj);
7302 if (str2 == NULL) {
7303 Py_DECREF(self);
7304 Py_DECREF(str1);
7305 return NULL;
7306 }
Tim Petersced69f82003-09-16 20:30:58 +00007307 result = replace((PyUnicodeObject *)self,
7308 (PyUnicodeObject *)str1,
7309 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 maxcount);
7311 Py_DECREF(self);
7312 Py_DECREF(str1);
7313 Py_DECREF(str2);
7314 return result;
7315}
7316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007317PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318"S.replace (old, new[, maxsplit]) -> unicode\n\
7319\n\
7320Return a copy of S with all occurrences of substring\n\
7321old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
7324static PyObject*
7325unicode_replace(PyUnicodeObject *self, PyObject *args)
7326{
7327 PyUnicodeObject *str1;
7328 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007329 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 PyObject *result;
7331
Martin v. Löwis18e16552006-02-15 17:27:45 +00007332 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 return NULL;
7334 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7335 if (str1 == NULL)
7336 return NULL;
7337 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007338 if (str2 == NULL) {
7339 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342
7343 result = replace(self, str1, str2, maxcount);
7344
7345 Py_DECREF(str1);
7346 Py_DECREF(str2);
7347 return result;
7348}
7349
7350static
7351PyObject *unicode_repr(PyObject *unicode)
7352{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007353 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007354 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007355 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7356 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7357
7358 /* XXX(nnorwitz): rather than over-allocating, it would be
7359 better to choose a different scheme. Perhaps scan the
7360 first N-chars of the string and allocate based on that size.
7361 */
7362 /* Initial allocation is based on the longest-possible unichr
7363 escape.
7364
7365 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7366 unichr, so in this case it's the longest unichr escape. In
7367 narrow (UTF-16) builds this is five chars per source unichr
7368 since there are two unichrs in the surrogate pair, so in narrow
7369 (UTF-16) builds it's not the longest unichr escape.
7370
7371 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7372 so in the narrow (UTF-16) build case it's the longest unichr
7373 escape.
7374 */
7375
Walter Dörwald1ab83302007-05-18 17:15:44 +00007376 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007377 2 /* quotes */
7378#ifdef Py_UNICODE_WIDE
7379 + 10*size
7380#else
7381 + 6*size
7382#endif
7383 + 1);
7384 if (repr == NULL)
7385 return NULL;
7386
Walter Dörwald1ab83302007-05-18 17:15:44 +00007387 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007388
7389 /* Add quote */
7390 *p++ = (findchar(s, size, '\'') &&
7391 !findchar(s, size, '"')) ? '"' : '\'';
7392 while (size-- > 0) {
7393 Py_UNICODE ch = *s++;
7394
7395 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007396 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007397 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007398 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007399 continue;
7400 }
7401
7402#ifdef Py_UNICODE_WIDE
7403 /* Map 21-bit characters to '\U00xxxxxx' */
7404 else if (ch >= 0x10000) {
7405 *p++ = '\\';
7406 *p++ = 'U';
7407 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7408 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7409 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7410 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7411 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7412 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7413 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7414 *p++ = hexdigits[ch & 0x0000000F];
7415 continue;
7416 }
7417#else
7418 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7419 else if (ch >= 0xD800 && ch < 0xDC00) {
7420 Py_UNICODE ch2;
7421 Py_UCS4 ucs;
7422
7423 ch2 = *s++;
7424 size--;
7425 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7426 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7427 *p++ = '\\';
7428 *p++ = 'U';
7429 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7430 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7431 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7432 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7433 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7434 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7435 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7436 *p++ = hexdigits[ucs & 0x0000000F];
7437 continue;
7438 }
7439 /* Fall through: isolated surrogates are copied as-is */
7440 s--;
7441 size++;
7442 }
7443#endif
7444
7445 /* Map 16-bit characters to '\uxxxx' */
7446 if (ch >= 256) {
7447 *p++ = '\\';
7448 *p++ = 'u';
7449 *p++ = hexdigits[(ch >> 12) & 0x000F];
7450 *p++ = hexdigits[(ch >> 8) & 0x000F];
7451 *p++ = hexdigits[(ch >> 4) & 0x000F];
7452 *p++ = hexdigits[ch & 0x000F];
7453 }
7454
7455 /* Map special whitespace to '\t', \n', '\r' */
7456 else if (ch == '\t') {
7457 *p++ = '\\';
7458 *p++ = 't';
7459 }
7460 else if (ch == '\n') {
7461 *p++ = '\\';
7462 *p++ = 'n';
7463 }
7464 else if (ch == '\r') {
7465 *p++ = '\\';
7466 *p++ = 'r';
7467 }
7468
7469 /* Map non-printable US ASCII to '\xhh' */
7470 else if (ch < ' ' || ch >= 0x7F) {
7471 *p++ = '\\';
7472 *p++ = 'x';
7473 *p++ = hexdigits[(ch >> 4) & 0x000F];
7474 *p++ = hexdigits[ch & 0x000F];
7475 }
7476
7477 /* Copy everything else as-is */
7478 else
7479 *p++ = (char) ch;
7480 }
7481 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007482 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007483
7484 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007485 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007486 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487}
7488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007489PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490"S.rfind(sub [,start [,end]]) -> int\n\
7491\n\
7492Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007493such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494arguments start and end are interpreted as in slice notation.\n\
7495\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
7498static PyObject *
7499unicode_rfind(PyUnicodeObject *self, PyObject *args)
7500{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007502 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007503 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007504 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505
Guido van Rossumb8872e62000-05-09 14:14:27 +00007506 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7507 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007509 substring = PyUnicode_FromObject(substring);
7510 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 return NULL;
7512
Thomas Wouters477c8d52006-05-27 19:21:47 +00007513 result = stringlib_rfind_slice(
7514 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7515 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7516 start, end
7517 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518
7519 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007520
7521 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522}
7523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007524PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525"S.rindex(sub [,start [,end]]) -> int\n\
7526\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007527Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
7529static PyObject *
7530unicode_rindex(PyUnicodeObject *self, PyObject *args)
7531{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007532 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007533 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007534 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007535 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536
Guido van Rossumb8872e62000-05-09 14:14:27 +00007537 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7538 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007540 substring = PyUnicode_FromObject(substring);
7541 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 return NULL;
7543
Thomas Wouters477c8d52006-05-27 19:21:47 +00007544 result = stringlib_rfind_slice(
7545 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7546 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7547 start, end
7548 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549
7550 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007551
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 if (result < 0) {
7553 PyErr_SetString(PyExc_ValueError, "substring not found");
7554 return NULL;
7555 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007556 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557}
7558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007560"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561\n\
7562Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007563done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564
7565static PyObject *
7566unicode_rjust(PyUnicodeObject *self, PyObject *args)
7567{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007568 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007569 Py_UNICODE fillchar = ' ';
7570
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007571 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 return NULL;
7573
Tim Peters7a29bd52001-09-12 03:03:31 +00007574 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 Py_INCREF(self);
7576 return (PyObject*) self;
7577 }
7578
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007579 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580}
7581
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582PyObject *PyUnicode_Split(PyObject *s,
7583 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007584 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585{
7586 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007587
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 s = PyUnicode_FromObject(s);
7589 if (s == NULL)
7590 return NULL;
7591 if (sep != NULL) {
7592 sep = PyUnicode_FromObject(sep);
7593 if (sep == NULL) {
7594 Py_DECREF(s);
7595 return NULL;
7596 }
7597 }
7598
7599 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7600
7601 Py_DECREF(s);
7602 Py_XDECREF(sep);
7603 return result;
7604}
7605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007606PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607"S.split([sep [,maxsplit]]) -> list of strings\n\
7608\n\
7609Return a list of the words in S, using sep as the\n\
7610delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007611splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007612any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613
7614static PyObject*
7615unicode_split(PyUnicodeObject *self, PyObject *args)
7616{
7617 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007618 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
Martin v. Löwis18e16552006-02-15 17:27:45 +00007620 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621 return NULL;
7622
7623 if (substring == Py_None)
7624 return split(self, NULL, maxcount);
7625 else if (PyUnicode_Check(substring))
7626 return split(self, (PyUnicodeObject *)substring, maxcount);
7627 else
7628 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7629}
7630
Thomas Wouters477c8d52006-05-27 19:21:47 +00007631PyObject *
7632PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7633{
7634 PyObject* str_obj;
7635 PyObject* sep_obj;
7636 PyObject* out;
7637
7638 str_obj = PyUnicode_FromObject(str_in);
7639 if (!str_obj)
7640 return NULL;
7641 sep_obj = PyUnicode_FromObject(sep_in);
7642 if (!sep_obj) {
7643 Py_DECREF(str_obj);
7644 return NULL;
7645 }
7646
7647 out = stringlib_partition(
7648 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7649 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7650 );
7651
7652 Py_DECREF(sep_obj);
7653 Py_DECREF(str_obj);
7654
7655 return out;
7656}
7657
7658
7659PyObject *
7660PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7661{
7662 PyObject* str_obj;
7663 PyObject* sep_obj;
7664 PyObject* out;
7665
7666 str_obj = PyUnicode_FromObject(str_in);
7667 if (!str_obj)
7668 return NULL;
7669 sep_obj = PyUnicode_FromObject(sep_in);
7670 if (!sep_obj) {
7671 Py_DECREF(str_obj);
7672 return NULL;
7673 }
7674
7675 out = stringlib_rpartition(
7676 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7677 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7678 );
7679
7680 Py_DECREF(sep_obj);
7681 Py_DECREF(str_obj);
7682
7683 return out;
7684}
7685
7686PyDoc_STRVAR(partition__doc__,
7687"S.partition(sep) -> (head, sep, tail)\n\
7688\n\
7689Searches for the separator sep in S, and returns the part before it,\n\
7690the separator itself, and the part after it. If the separator is not\n\
7691found, returns S and two empty strings.");
7692
7693static PyObject*
7694unicode_partition(PyUnicodeObject *self, PyObject *separator)
7695{
7696 return PyUnicode_Partition((PyObject *)self, separator);
7697}
7698
7699PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007700"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007701\n\
7702Searches for the separator sep in S, starting at the end of S, and returns\n\
7703the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007704separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007705
7706static PyObject*
7707unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7708{
7709 return PyUnicode_RPartition((PyObject *)self, separator);
7710}
7711
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007712PyObject *PyUnicode_RSplit(PyObject *s,
7713 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007714 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007715{
7716 PyObject *result;
7717
7718 s = PyUnicode_FromObject(s);
7719 if (s == NULL)
7720 return NULL;
7721 if (sep != NULL) {
7722 sep = PyUnicode_FromObject(sep);
7723 if (sep == NULL) {
7724 Py_DECREF(s);
7725 return NULL;
7726 }
7727 }
7728
7729 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7730
7731 Py_DECREF(s);
7732 Py_XDECREF(sep);
7733 return result;
7734}
7735
7736PyDoc_STRVAR(rsplit__doc__,
7737"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7738\n\
7739Return a list of the words in S, using sep as the\n\
7740delimiter string, starting at the end of the string and\n\
7741working to the front. If maxsplit is given, at most maxsplit\n\
7742splits are done. If sep is not specified, any whitespace string\n\
7743is a separator.");
7744
7745static PyObject*
7746unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7747{
7748 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007749 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007750
Martin v. Löwis18e16552006-02-15 17:27:45 +00007751 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007752 return NULL;
7753
7754 if (substring == Py_None)
7755 return rsplit(self, NULL, maxcount);
7756 else if (PyUnicode_Check(substring))
7757 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7758 else
7759 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7760}
7761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007763"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764\n\
7765Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007766Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007767is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768
7769static PyObject*
7770unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7771{
Guido van Rossum86662912000-04-11 15:38:46 +00007772 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773
Guido van Rossum86662912000-04-11 15:38:46 +00007774 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 return NULL;
7776
Guido van Rossum86662912000-04-11 15:38:46 +00007777 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778}
7779
7780static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007781PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782{
Walter Dörwald346737f2007-05-31 10:44:43 +00007783 if (PyUnicode_CheckExact(self)) {
7784 Py_INCREF(self);
7785 return self;
7786 } else
7787 /* Subtype -- return genuine unicode string with the same value. */
7788 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7789 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790}
7791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007792PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793"S.swapcase() -> unicode\n\
7794\n\
7795Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007796and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797
7798static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007799unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 return fixup(self, fixswapcase);
7802}
7803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007804PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805"S.translate(table) -> unicode\n\
7806\n\
7807Return a copy of the string S, where all characters have been mapped\n\
7808through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007809Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7810Unmapped characters are left untouched. Characters mapped to None\n\
7811are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
7813static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007814unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815{
Georg Brandl94c2c752007-10-23 06:52:59 +00007816 PyObject *newtable = NULL;
7817 Py_ssize_t i = 0;
7818 PyObject *key, *value, *result;
7819
7820 if (!PyDict_Check(table)) {
7821 PyErr_SetString(PyExc_TypeError, "translate argument must be a dict");
7822 return NULL;
7823 }
7824 /* fixup the table -- allow size-1 string keys instead of only int keys */
7825 newtable = PyDict_Copy(table);
7826 if (!newtable) return NULL;
7827 while (PyDict_Next(table, &i, &key, &value)) {
7828 if (PyUnicode_Check(key)) {
7829 /* convert string keys to integer keys */
7830 PyObject *newkey;
7831 int res;
7832 if (PyUnicode_GET_SIZE(key) != 1) {
7833 PyErr_SetString(PyExc_ValueError, "string items in translate "
7834 "table must be 1 element long");
7835 goto err;
7836 }
7837 newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]);
7838 if (!newkey)
7839 goto err;
7840 res = PyDict_SetItem(newtable, newkey, value);
7841 Py_DECREF(newkey);
7842 if (res < 0)
7843 goto err;
7844 } else if (PyInt_Check(key)) {
7845 /* just keep integer keys */
7846 if (PyDict_SetItem(newtable, key, value) < 0)
7847 goto err;
7848 } else {
7849 PyErr_SetString(PyExc_TypeError, "items in translate table must be "
7850 "strings or integers");
7851 goto err;
7852 }
7853 }
7854
7855 result = PyUnicode_TranslateCharmap(self->str,
7856 self->length,
7857 newtable,
7858 "ignore");
7859 Py_DECREF(newtable);
7860 return result;
7861 err:
7862 Py_DECREF(newtable);
7863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864}
7865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007866PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867"S.upper() -> unicode\n\
7868\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007869Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870
7871static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007872unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 return fixup(self, fixupper);
7875}
7876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007877PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878"S.zfill(width) -> unicode\n\
7879\n\
7880Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007881of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882
7883static PyObject *
7884unicode_zfill(PyUnicodeObject *self, PyObject *args)
7885{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007886 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 PyUnicodeObject *u;
7888
Martin v. Löwis18e16552006-02-15 17:27:45 +00007889 Py_ssize_t width;
7890 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 return NULL;
7892
7893 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007894 if (PyUnicode_CheckExact(self)) {
7895 Py_INCREF(self);
7896 return (PyObject*) self;
7897 }
7898 else
7899 return PyUnicode_FromUnicode(
7900 PyUnicode_AS_UNICODE(self),
7901 PyUnicode_GET_SIZE(self)
7902 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 }
7904
7905 fill = width - self->length;
7906
7907 u = pad(self, fill, 0, '0');
7908
Walter Dörwald068325e2002-04-15 13:36:47 +00007909 if (u == NULL)
7910 return NULL;
7911
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 if (u->str[fill] == '+' || u->str[fill] == '-') {
7913 /* move sign to beginning of string */
7914 u->str[0] = u->str[fill];
7915 u->str[fill] = '0';
7916 }
7917
7918 return (PyObject*) u;
7919}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as a
   Python integer; the matching (also disabled) method-table entry
   describes it as used only for debugging the implementation. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007929PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007930"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007932Return True if S starts with the specified prefix, False otherwise.\n\
7933With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007934With optional end, stop comparing S at that position.\n\
7935prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936
7937static PyObject *
7938unicode_startswith(PyUnicodeObject *self,
7939 PyObject *args)
7940{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007941 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007943 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007944 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007945 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007947 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007948 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007950 if (PyTuple_Check(subobj)) {
7951 Py_ssize_t i;
7952 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7953 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7954 PyTuple_GET_ITEM(subobj, i));
7955 if (substring == NULL)
7956 return NULL;
7957 result = tailmatch(self, substring, start, end, -1);
7958 Py_DECREF(substring);
7959 if (result) {
7960 Py_RETURN_TRUE;
7961 }
7962 }
7963 /* nothing matched */
7964 Py_RETURN_FALSE;
7965 }
7966 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007968 return NULL;
7969 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007971 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972}
7973
7974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007975PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007976"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007978Return True if S ends with the specified suffix, False otherwise.\n\
7979With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007980With optional end, stop comparing S at that position.\n\
7981suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982
7983static PyObject *
7984unicode_endswith(PyUnicodeObject *self,
7985 PyObject *args)
7986{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007987 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007989 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007990 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007991 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007993 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7994 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007996 if (PyTuple_Check(subobj)) {
7997 Py_ssize_t i;
7998 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7999 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8000 PyTuple_GET_ITEM(subobj, i));
8001 if (substring == NULL)
8002 return NULL;
8003 result = tailmatch(self, substring, start, end, +1);
8004 Py_DECREF(substring);
8005 if (result) {
8006 Py_RETURN_TRUE;
8007 }
8008 }
8009 Py_RETURN_FALSE;
8010 }
8011 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008015 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008017 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018}
8019
Eric Smith8c663262007-08-25 02:26:07 +00008020#include "stringlib/string_format.h"
8021
8022PyDoc_STRVAR(format__doc__,
8023"S.format(*args, **kwargs) -> unicode\n\
8024\n\
8025");
8026
Eric Smith8c663262007-08-25 02:26:07 +00008027PyDoc_STRVAR(p_format__doc__,
8028"S.__format__(format_spec) -> unicode\n\
8029\n\
8030");
8031
8032static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008033unicode_getnewargs(PyUnicodeObject *v)
8034{
8035 return Py_BuildValue("(u#)", v->str, v->length);
8036}
8037
8038
/* Method table for the unicode type.  Each entry gives the Python-visible
   name, the implementing C function, its calling-convention flags and,
   where one exists, the docstring.  Entries with only three fields
   leave the docstring pointer zero-initialized.  The table ends with the
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8099
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008100static PyObject *
8101unicode_mod(PyObject *v, PyObject *w)
8102{
8103 if (!PyUnicode_Check(v)) {
8104 Py_INCREF(Py_NotImplemented);
8105 return Py_NotImplemented;
8106 }
8107 return PyUnicode_Format(v, w);
8108}
8109
/* Number protocol table for unicode objects.  Only nb_remainder (the %
   operator) is provided; the three slots before it are explicitly 0 and
   every slot after it is left to C's implicit zero-initialization of
   partially initialized aggregates. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
8116
/* Sequence protocol table for unicode objects: length, concatenation,
   repetition, item access and containment are provided; the slice and
   assignment slots are 0 (not implemented here). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, /* sq_length */
    PyUnicode_Concat, /* sq_concat */
    (ssizeargfunc) unicode_repeat, /* sq_repeat */
    (ssizeargfunc) unicode_getitem, /* sq_item */
    0, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    PyUnicode_Contains, /* sq_contains */
};
8127
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008128static PyObject*
8129unicode_subscript(PyUnicodeObject* self, PyObject* item)
8130{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008131 if (PyIndex_Check(item)) {
8132 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008133 if (i == -1 && PyErr_Occurred())
8134 return NULL;
8135 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008136 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008137 return unicode_getitem(self, i);
8138 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008139 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008140 Py_UNICODE* source_buf;
8141 Py_UNICODE* result_buf;
8142 PyObject* result;
8143
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008144 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008145 &start, &stop, &step, &slicelength) < 0) {
8146 return NULL;
8147 }
8148
8149 if (slicelength <= 0) {
8150 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008151 } else if (start == 0 && step == 1 && slicelength == self->length &&
8152 PyUnicode_CheckExact(self)) {
8153 Py_INCREF(self);
8154 return (PyObject *)self;
8155 } else if (step == 1) {
8156 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008157 } else {
8158 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008159 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8160 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008161
8162 if (result_buf == NULL)
8163 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008164
8165 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8166 result_buf[i] = source_buf[cur];
8167 }
Tim Petersced69f82003-09-16 20:30:58 +00008168
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008169 result = PyUnicode_FromUnicode(result_buf, slicelength);
8170 PyMem_FREE(result_buf);
8171 return result;
8172 }
8173 } else {
8174 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8175 return NULL;
8176 }
8177}
8178
/* Mapping protocol table for unicode objects: length and subscription
   (integer index or slice, via unicode_subscript) are provided; item
   assignment is not (slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length, /* mp_length */
    (binaryfunc)unicode_subscript, /* mp_subscript */
    (objobjargproc)0, /* mp_ass_subscript */
};
8184
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186/* Helpers for PyUnicode_Format() */
8187
8188static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008189getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008191 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 if (argidx < arglen) {
8193 (*p_argidx)++;
8194 if (arglen < 0)
8195 return args;
8196 else
8197 return PyTuple_GetItem(args, argidx);
8198 }
8199 PyErr_SetString(PyExc_TypeError,
8200 "not enough arguments for format string");
8201 return NULL;
8202}
8203
Martin v. Löwis18e16552006-02-15 17:27:45 +00008204static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008205strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008207 register Py_ssize_t i;
8208 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 for (i = len - 1; i >= 0; i--)
8210 buffer[i] = (Py_UNICODE) charbuffer[i];
8211
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 return len;
8213}
8214
Neal Norwitzfc76d632006-01-10 06:03:13 +00008215static int
8216doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8217{
Tim Peters15231542006-02-16 01:08:01 +00008218 Py_ssize_t result;
8219
Neal Norwitzfc76d632006-01-10 06:03:13 +00008220 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008221 result = strtounicode(buffer, (char *)buffer);
8222 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008223}
8224
8225static int
8226longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8227{
Tim Peters15231542006-02-16 01:08:01 +00008228 Py_ssize_t result;
8229
Neal Norwitzfc76d632006-01-10 06:03:13 +00008230 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008231 result = strtounicode(buffer, (char *)buffer);
8232 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008233}
8234
Guido van Rossum078151d2002-08-11 04:24:12 +00008235/* XXX To save some code duplication, formatfloat/long/int could have been
8236 shared with stringobject.c, converting from 8-bit to Unicode after the
8237 formatting is done. */
8238
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239static int
8240formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008241 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 int flags,
8243 int prec,
8244 int type,
8245 PyObject *v)
8246{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008247 /* fmt = '%#.' + `prec` + `type`
8248 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 char fmt[20];
8250 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008251
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 x = PyFloat_AsDouble(v);
8253 if (x == -1.0 && PyErr_Occurred())
8254 return -1;
8255 if (prec < 0)
8256 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8258 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008259 /* Worst case length calc to ensure no buffer overrun:
8260
8261 'g' formats:
8262 fmt = %#.<prec>g
8263 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8264 for any double rep.)
8265 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8266
8267 'f' formats:
8268 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8269 len = 1 + 50 + 1 + prec = 52 + prec
8270
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008271 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008272 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008273
8274 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008275 if (((type == 'g' || type == 'G') &&
8276 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008277 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008278 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008279 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008280 return -1;
8281 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008282 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8283 (flags&F_ALT) ? "#" : "",
8284 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008285 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286}
8287
Tim Peters38fd5b62000-09-21 05:43:11 +00008288static PyObject*
8289formatlong(PyObject *val, int flags, int prec, int type)
8290{
8291 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008292 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008293 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008294 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008295
8296 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8297 if (!str)
8298 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008299 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008300 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008301 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008302}
8303
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304static int
8305formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008306 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 int flags,
8308 int prec,
8309 int type,
8310 PyObject *v)
8311{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008312 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008313 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8314 * + 1 + 1
8315 * = 24
8316 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008317 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008318 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 long x;
8320
8321 x = PyInt_AsLong(v);
8322 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008323 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008324 if (x < 0 && type == 'u') {
8325 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008326 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008327 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8328 sign = "-";
8329 else
8330 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008332 prec = 1;
8333
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008334 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8335 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008336 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008337 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008338 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008339 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008340 return -1;
8341 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008342
8343 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008344 (type == 'x' || type == 'X' || type == 'o')) {
8345 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008346 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008347 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008348 * - when 0 is being converted, the C standard leaves off
8349 * the '0x' or '0X', which is inconsistent with other
8350 * %#x/%#X conversions and inconsistent with Python's
8351 * hex() function
8352 * - there are platforms that violate the standard and
8353 * convert 0 with the '0x' or '0X'
8354 * (Metrowerks, Compaq Tru64)
8355 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008356 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008357 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008358 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008359 * We can achieve the desired consistency by inserting our
8360 * own '0x' or '0X' prefix, and substituting %x/%X in place
8361 * of %#x/%#X.
8362 *
8363 * Note that this is the same approach as used in
8364 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008365 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008366 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8367 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008368 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008369 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008370 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8371 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008372 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008373 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008374 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008375 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008376 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008377 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378}
8379
8380static int
8381formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008382 size_t buflen,
8383 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008385 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008386 if (PyUnicode_Check(v)) {
8387 if (PyUnicode_GET_SIZE(v) != 1)
8388 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008392 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008393 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008394 goto onError;
8395 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397
8398 else {
8399 /* Integer input truncated to a character */
8400 long x;
8401 x = PyInt_AsLong(v);
8402 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008403 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008404#ifdef Py_UNICODE_WIDE
8405 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008406 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008407 "%c arg not in range(0x110000) "
8408 "(wide Python build)");
8409 return -1;
8410 }
8411#else
8412 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008413 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008414 "%c arg not in range(0x10000) "
8415 "(narrow Python build)");
8416 return -1;
8417 }
8418#endif
8419 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 }
8421 buf[1] = '\0';
8422 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008423
8424 onError:
8425 PyErr_SetString(PyExc_TypeError,
8426 "%c requires int or char");
8427 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428}
8429
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008430/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8431
8432 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8433 chars are formatted. XXX This is a magic number. Each formatting
8434 routine does bounds checking to ensure no overflow, but a better
8435 solution may be to malloc a buffer of appropriate size for each
8436 format. For now, the current solution is sufficient.
8437*/
8438#define FORMATBUFLEN (size_t)120
8439
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440PyObject *PyUnicode_Format(PyObject *format,
8441 PyObject *args)
8442{
8443 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008444 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 int args_owned = 0;
8446 PyUnicodeObject *result = NULL;
8447 PyObject *dict = NULL;
8448 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008449
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 if (format == NULL || args == NULL) {
8451 PyErr_BadInternalCall();
8452 return NULL;
8453 }
8454 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008455 if (uformat == NULL)
8456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 fmt = PyUnicode_AS_UNICODE(uformat);
8458 fmtcnt = PyUnicode_GET_SIZE(uformat);
8459
8460 reslen = rescnt = fmtcnt + 100;
8461 result = _PyUnicode_New(reslen);
8462 if (result == NULL)
8463 goto onError;
8464 res = PyUnicode_AS_UNICODE(result);
8465
8466 if (PyTuple_Check(args)) {
8467 arglen = PyTuple_Size(args);
8468 argidx = 0;
8469 }
8470 else {
8471 arglen = -1;
8472 argidx = -2;
8473 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008474 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008475 !PyString_Check(args) && !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 dict = args;
8477
8478 while (--fmtcnt >= 0) {
8479 if (*fmt != '%') {
8480 if (--rescnt < 0) {
8481 rescnt = fmtcnt + 100;
8482 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008483 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008484 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8486 --rescnt;
8487 }
8488 *res++ = *fmt++;
8489 }
8490 else {
8491 /* Got a format specifier */
8492 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008493 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 Py_UNICODE c = '\0';
8496 Py_UNICODE fill;
8497 PyObject *v = NULL;
8498 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008499 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008501 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008502 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503
8504 fmt++;
8505 if (*fmt == '(') {
8506 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008507 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 PyObject *key;
8509 int pcount = 1;
8510
8511 if (dict == NULL) {
8512 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008513 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 goto onError;
8515 }
8516 ++fmt;
8517 --fmtcnt;
8518 keystart = fmt;
8519 /* Skip over balanced parentheses */
8520 while (pcount > 0 && --fmtcnt >= 0) {
8521 if (*fmt == ')')
8522 --pcount;
8523 else if (*fmt == '(')
8524 ++pcount;
8525 fmt++;
8526 }
8527 keylen = fmt - keystart - 1;
8528 if (fmtcnt < 0 || pcount > 0) {
8529 PyErr_SetString(PyExc_ValueError,
8530 "incomplete format key");
8531 goto onError;
8532 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008533#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008534 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 then looked up since Python uses strings to hold
8536 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008537 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 key = PyUnicode_EncodeUTF8(keystart,
8539 keylen,
8540 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008541#else
8542 key = PyUnicode_FromUnicode(keystart, keylen);
8543#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 if (key == NULL)
8545 goto onError;
8546 if (args_owned) {
8547 Py_DECREF(args);
8548 args_owned = 0;
8549 }
8550 args = PyObject_GetItem(dict, key);
8551 Py_DECREF(key);
8552 if (args == NULL) {
8553 goto onError;
8554 }
8555 args_owned = 1;
8556 arglen = -1;
8557 argidx = -2;
8558 }
8559 while (--fmtcnt >= 0) {
8560 switch (c = *fmt++) {
8561 case '-': flags |= F_LJUST; continue;
8562 case '+': flags |= F_SIGN; continue;
8563 case ' ': flags |= F_BLANK; continue;
8564 case '#': flags |= F_ALT; continue;
8565 case '0': flags |= F_ZERO; continue;
8566 }
8567 break;
8568 }
8569 if (c == '*') {
8570 v = getnextarg(args, arglen, &argidx);
8571 if (v == NULL)
8572 goto onError;
8573 if (!PyInt_Check(v)) {
8574 PyErr_SetString(PyExc_TypeError,
8575 "* wants int");
8576 goto onError;
8577 }
8578 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008579 if (width == -1 && PyErr_Occurred())
8580 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 if (width < 0) {
8582 flags |= F_LJUST;
8583 width = -width;
8584 }
8585 if (--fmtcnt >= 0)
8586 c = *fmt++;
8587 }
8588 else if (c >= '0' && c <= '9') {
8589 width = c - '0';
8590 while (--fmtcnt >= 0) {
8591 c = *fmt++;
8592 if (c < '0' || c > '9')
8593 break;
8594 if ((width*10) / 10 != width) {
8595 PyErr_SetString(PyExc_ValueError,
8596 "width too big");
8597 goto onError;
8598 }
8599 width = width*10 + (c - '0');
8600 }
8601 }
8602 if (c == '.') {
8603 prec = 0;
8604 if (--fmtcnt >= 0)
8605 c = *fmt++;
8606 if (c == '*') {
8607 v = getnextarg(args, arglen, &argidx);
8608 if (v == NULL)
8609 goto onError;
8610 if (!PyInt_Check(v)) {
8611 PyErr_SetString(PyExc_TypeError,
8612 "* wants int");
8613 goto onError;
8614 }
8615 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008616 if (prec == -1 && PyErr_Occurred())
8617 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 if (prec < 0)
8619 prec = 0;
8620 if (--fmtcnt >= 0)
8621 c = *fmt++;
8622 }
8623 else if (c >= '0' && c <= '9') {
8624 prec = c - '0';
8625 while (--fmtcnt >= 0) {
8626 c = Py_CHARMASK(*fmt++);
8627 if (c < '0' || c > '9')
8628 break;
8629 if ((prec*10) / 10 != prec) {
8630 PyErr_SetString(PyExc_ValueError,
8631 "prec too big");
8632 goto onError;
8633 }
8634 prec = prec*10 + (c - '0');
8635 }
8636 }
8637 } /* prec */
8638 if (fmtcnt >= 0) {
8639 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 if (--fmtcnt >= 0)
8641 c = *fmt++;
8642 }
8643 }
8644 if (fmtcnt < 0) {
8645 PyErr_SetString(PyExc_ValueError,
8646 "incomplete format");
8647 goto onError;
8648 }
8649 if (c != '%') {
8650 v = getnextarg(args, arglen, &argidx);
8651 if (v == NULL)
8652 goto onError;
8653 }
8654 sign = 0;
8655 fill = ' ';
8656 switch (c) {
8657
8658 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008659 pbuf = formatbuf;
8660 /* presume that buffer length is at least 1 */
8661 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 len = 1;
8663 break;
8664
8665 case 's':
8666 case 'r':
8667 if (PyUnicode_Check(v) && c == 's') {
8668 temp = v;
8669 Py_INCREF(temp);
8670 }
8671 else {
8672 PyObject *unicode;
8673 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008674 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 else
8676 temp = PyObject_Repr(v);
8677 if (temp == NULL)
8678 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008679 if (PyUnicode_Check(temp))
8680 /* nothing to do */;
8681 else if (PyString_Check(temp)) {
8682 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008683 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008685 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008687 Py_DECREF(temp);
8688 temp = unicode;
8689 if (temp == NULL)
8690 goto onError;
8691 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008692 else {
8693 Py_DECREF(temp);
8694 PyErr_SetString(PyExc_TypeError,
8695 "%s argument has non-string str()");
8696 goto onError;
8697 }
8698 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008699 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 len = PyUnicode_GET_SIZE(temp);
8701 if (prec >= 0 && len > prec)
8702 len = prec;
8703 break;
8704
8705 case 'i':
8706 case 'd':
8707 case 'u':
8708 case 'o':
8709 case 'x':
8710 case 'X':
8711 if (c == 'i')
8712 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008713 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008714 temp = formatlong(v, flags, prec, c);
8715 if (!temp)
8716 goto onError;
8717 pbuf = PyUnicode_AS_UNICODE(temp);
8718 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008719 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008721 else {
8722 pbuf = formatbuf;
8723 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8724 flags, prec, c, v);
8725 if (len < 0)
8726 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008727 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008728 }
8729 if (flags & F_ZERO)
8730 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 break;
8732
8733 case 'e':
8734 case 'E':
8735 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008736 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 case 'g':
8738 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008739 if (c == 'F')
8740 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008741 pbuf = formatbuf;
8742 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8743 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 if (len < 0)
8745 goto onError;
8746 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008747 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 fill = '0';
8749 break;
8750
8751 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008752 pbuf = formatbuf;
8753 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 if (len < 0)
8755 goto onError;
8756 break;
8757
8758 default:
8759 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008760 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008761 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008762 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008763 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008764 (Py_ssize_t)(fmt - 1 -
8765 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 goto onError;
8767 }
8768 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008769 if (*pbuf == '-' || *pbuf == '+') {
8770 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 len--;
8772 }
8773 else if (flags & F_SIGN)
8774 sign = '+';
8775 else if (flags & F_BLANK)
8776 sign = ' ';
8777 else
8778 sign = 0;
8779 }
8780 if (width < len)
8781 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008782 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 reslen -= rescnt;
8784 rescnt = width + fmtcnt + 100;
8785 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008786 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008787 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008788 PyErr_NoMemory();
8789 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008790 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008791 if (_PyUnicode_Resize(&result, reslen) < 0) {
8792 Py_XDECREF(temp);
8793 goto onError;
8794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795 res = PyUnicode_AS_UNICODE(result)
8796 + reslen - rescnt;
8797 }
8798 if (sign) {
8799 if (fill != ' ')
8800 *res++ = sign;
8801 rescnt--;
8802 if (width > len)
8803 width--;
8804 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008805 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008806 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008807 assert(pbuf[1] == c);
8808 if (fill != ' ') {
8809 *res++ = *pbuf++;
8810 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008811 }
Tim Petersfff53252001-04-12 18:38:48 +00008812 rescnt -= 2;
8813 width -= 2;
8814 if (width < 0)
8815 width = 0;
8816 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 if (width > len && !(flags & F_LJUST)) {
8819 do {
8820 --rescnt;
8821 *res++ = fill;
8822 } while (--width > len);
8823 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008824 if (fill == ' ') {
8825 if (sign)
8826 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008827 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008828 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008829 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008830 *res++ = *pbuf++;
8831 *res++ = *pbuf++;
8832 }
8833 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008834 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 res += len;
8836 rescnt -= len;
8837 while (--width >= len) {
8838 --rescnt;
8839 *res++ = ' ';
8840 }
8841 if (dict && (argidx < arglen) && c != '%') {
8842 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008843 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008844 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845 goto onError;
8846 }
8847 Py_XDECREF(temp);
8848 } /* '%' */
8849 } /* until end */
8850 if (argidx < arglen && !dict) {
8851 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008852 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 goto onError;
8854 }
8855
Thomas Woutersa96affe2006-03-12 00:29:36 +00008856 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8857 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 if (args_owned) {
8859 Py_DECREF(args);
8860 }
8861 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 return (PyObject *)result;
8863
8864 onError:
8865 Py_XDECREF(result);
8866 Py_DECREF(uformat);
8867 if (args_owned) {
8868 Py_DECREF(args);
8869 }
8870 return NULL;
8871}
8872
Jeremy Hylton938ace62002-07-17 16:30:39 +00008873static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008874unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8875
Tim Peters6d6c1a32001-08-02 04:15:00 +00008876static PyObject *
8877unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8878{
8879 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008880 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008881 char *encoding = NULL;
8882 char *errors = NULL;
8883
Guido van Rossume023fe02001-08-30 03:12:59 +00008884 if (type != &PyUnicode_Type)
8885 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008886 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8887 kwlist, &x, &encoding, &errors))
8888 return NULL;
8889 if (x == NULL)
8890 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008891 if (encoding == NULL && errors == NULL)
8892 return PyObject_Unicode(x);
8893 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008894 return PyUnicode_FromEncodedObject(x, encoding, errors);
8895}
8896
Guido van Rossume023fe02001-08-30 03:12:59 +00008897static PyObject *
8898unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8899{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008900 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008901 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008902
8903 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8904 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8905 if (tmp == NULL)
8906 return NULL;
8907 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008908 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008909 if (pnew == NULL) {
8910 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008911 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008912 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008913 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8914 if (pnew->str == NULL) {
8915 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008916 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008917 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008918 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008919 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008920 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8921 pnew->length = n;
8922 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008923 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008924 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008925}
8926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008927PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008928"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008929\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008930Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008931encoding defaults to the current default string encoding.\n\
8932errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008933
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008934static PyObject *unicode_iter(PyObject *seq);
8935
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008937 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008938 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 sizeof(PyUnicodeObject), /* tp_size */
8940 0, /* tp_itemsize */
8941 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008942 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008944 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008946 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008947 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008948 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008950 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 (hashfunc) unicode_hash, /* tp_hash*/
8952 0, /* tp_call*/
8953 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008954 PyObject_GenericGetAttr, /* tp_getattro */
8955 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008956 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008957 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8958 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008959 unicode_doc, /* tp_doc */
8960 0, /* tp_traverse */
8961 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008962 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008963 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008964 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008965 0, /* tp_iternext */
8966 unicode_methods, /* tp_methods */
8967 0, /* tp_members */
8968 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008969 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008970 0, /* tp_dict */
8971 0, /* tp_descr_get */
8972 0, /* tp_descr_set */
8973 0, /* tp_dictoffset */
8974 0, /* tp_init */
8975 0, /* tp_alloc */
8976 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008977 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978};
8979
8980/* Initialize the Unicode implementation */
8981
Thomas Wouters78890102000-07-22 19:25:51 +00008982void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008984 int i;
8985
Thomas Wouters477c8d52006-05-27 19:21:47 +00008986 /* XXX - move this array to unicodectype.c ? */
8987 Py_UNICODE linebreak[] = {
8988 0x000A, /* LINE FEED */
8989 0x000D, /* CARRIAGE RETURN */
8990 0x001C, /* FILE SEPARATOR */
8991 0x001D, /* GROUP SEPARATOR */
8992 0x001E, /* RECORD SEPARATOR */
8993 0x0085, /* NEXT LINE */
8994 0x2028, /* LINE SEPARATOR */
8995 0x2029, /* PARAGRAPH SEPARATOR */
8996 };
8997
Fred Drakee4315f52000-05-09 19:53:39 +00008998 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008999 unicode_freelist = NULL;
9000 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009002 if (!unicode_empty)
9003 return;
9004
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009005 for (i = 0; i < 256; i++)
9006 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009007 if (PyType_Ready(&PyUnicode_Type) < 0)
9008 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009009
9010 /* initialize the linebreak bloom filter */
9011 bloom_linebreak = make_bloom_mask(
9012 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9013 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009014
9015 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016}
9017
9018/* Finalize the Unicode implementation */
9019
9020void
Thomas Wouters78890102000-07-22 19:25:51 +00009021_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009023 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009024 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009026 Py_XDECREF(unicode_empty);
9027 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009028
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009029 for (i = 0; i < 256; i++) {
9030 if (unicode_latin1[i]) {
9031 Py_DECREF(unicode_latin1[i]);
9032 unicode_latin1[i] = NULL;
9033 }
9034 }
9035
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009036 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 PyUnicodeObject *v = u;
9038 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009039 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009040 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009041 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009042 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009044 unicode_freelist = NULL;
9045 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009047
Walter Dörwald16807132007-05-25 13:52:07 +00009048void
9049PyUnicode_InternInPlace(PyObject **p)
9050{
9051 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9052 PyObject *t;
9053 if (s == NULL || !PyUnicode_Check(s))
9054 Py_FatalError(
9055 "PyUnicode_InternInPlace: unicode strings only please!");
9056 /* If it's a subclass, we don't really know what putting
9057 it in the interned dict might do. */
9058 if (!PyUnicode_CheckExact(s))
9059 return;
9060 if (PyUnicode_CHECK_INTERNED(s))
9061 return;
9062 if (interned == NULL) {
9063 interned = PyDict_New();
9064 if (interned == NULL) {
9065 PyErr_Clear(); /* Don't leave an exception */
9066 return;
9067 }
9068 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009069 /* It might be that the GetItem call fails even
9070 though the key is present in the dictionary,
9071 namely when this happens during a stack overflow. */
9072 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009073 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009074 Py_END_ALLOW_RECURSION
9075
Walter Dörwald16807132007-05-25 13:52:07 +00009076 if (t) {
9077 Py_INCREF(t);
9078 Py_DECREF(*p);
9079 *p = t;
9080 return;
9081 }
9082
Martin v. Löwis5b222132007-06-10 09:51:05 +00009083 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009084 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9085 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009086 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009087 return;
9088 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009089 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009090 /* The two references in interned are not counted by refcnt.
9091 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009092 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009093 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9094}
9095
9096void
9097PyUnicode_InternImmortal(PyObject **p)
9098{
9099 PyUnicode_InternInPlace(p);
9100 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9101 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9102 Py_INCREF(*p);
9103 }
9104}
9105
9106PyObject *
9107PyUnicode_InternFromString(const char *cp)
9108{
9109 PyObject *s = PyUnicode_FromString(cp);
9110 if (s == NULL)
9111 return NULL;
9112 PyUnicode_InternInPlace(&s);
9113 return s;
9114}
9115
9116void _Py_ReleaseInternedUnicodeStrings(void)
9117{
9118 PyObject *keys;
9119 PyUnicodeObject *s;
9120 Py_ssize_t i, n;
9121 Py_ssize_t immortal_size = 0, mortal_size = 0;
9122
9123 if (interned == NULL || !PyDict_Check(interned))
9124 return;
9125 keys = PyDict_Keys(interned);
9126 if (keys == NULL || !PyList_Check(keys)) {
9127 PyErr_Clear();
9128 return;
9129 }
9130
9131 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9132 detector, interned unicode strings are not forcibly deallocated;
9133 rather, we give them their stolen references back, and then clear
9134 and DECREF the interned dict. */
9135
9136 n = PyList_GET_SIZE(keys);
9137 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9138 n);
9139 for (i = 0; i < n; i++) {
9140 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9141 switch (s->state) {
9142 case SSTATE_NOT_INTERNED:
9143 /* XXX Shouldn't happen */
9144 break;
9145 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009146 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009147 immortal_size += s->length;
9148 break;
9149 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009150 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009151 mortal_size += s->length;
9152 break;
9153 default:
9154 Py_FatalError("Inconsistent interned string state.");
9155 }
9156 s->state = SSTATE_NOT_INTERNED;
9157 }
9158 fprintf(stderr, "total size of all interned strings: "
9159 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9160 "mortal/immortal\n", mortal_size, immortal_size);
9161 Py_DECREF(keys);
9162 PyDict_Clear(interned);
9163 Py_DECREF(interned);
9164 interned = NULL;
9165}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009166
9167
9168/********************* Unicode Iterator **************************/
9169
9170typedef struct {
9171 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009172 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009173 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9174} unicodeiterobject;
9175
9176static void
9177unicodeiter_dealloc(unicodeiterobject *it)
9178{
9179 _PyObject_GC_UNTRACK(it);
9180 Py_XDECREF(it->it_seq);
9181 PyObject_GC_Del(it);
9182}
9183
9184static int
9185unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9186{
9187 Py_VISIT(it->it_seq);
9188 return 0;
9189}
9190
9191static PyObject *
9192unicodeiter_next(unicodeiterobject *it)
9193{
9194 PyUnicodeObject *seq;
9195 PyObject *item;
9196
9197 assert(it != NULL);
9198 seq = it->it_seq;
9199 if (seq == NULL)
9200 return NULL;
9201 assert(PyUnicode_Check(seq));
9202
9203 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009204 item = PyUnicode_FromUnicode(
9205 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009206 if (item != NULL)
9207 ++it->it_index;
9208 return item;
9209 }
9210
9211 Py_DECREF(seq);
9212 it->it_seq = NULL;
9213 return NULL;
9214}
9215
9216static PyObject *
9217unicodeiter_len(unicodeiterobject *it)
9218{
9219 Py_ssize_t len = 0;
9220 if (it->it_seq)
9221 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9222 return PyInt_FromSsize_t(len);
9223}
9224
9225PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9226
9227static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009228 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9229 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009230 {NULL, NULL} /* sentinel */
9231};
9232
9233PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009234 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009235 "unicodeiterator", /* tp_name */
9236 sizeof(unicodeiterobject), /* tp_basicsize */
9237 0, /* tp_itemsize */
9238 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009239 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009240 0, /* tp_print */
9241 0, /* tp_getattr */
9242 0, /* tp_setattr */
9243 0, /* tp_compare */
9244 0, /* tp_repr */
9245 0, /* tp_as_number */
9246 0, /* tp_as_sequence */
9247 0, /* tp_as_mapping */
9248 0, /* tp_hash */
9249 0, /* tp_call */
9250 0, /* tp_str */
9251 PyObject_GenericGetAttr, /* tp_getattro */
9252 0, /* tp_setattro */
9253 0, /* tp_as_buffer */
9254 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9255 0, /* tp_doc */
9256 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9257 0, /* tp_clear */
9258 0, /* tp_richcompare */
9259 0, /* tp_weaklistoffset */
9260 PyObject_SelfIter, /* tp_iter */
9261 (iternextfunc)unicodeiter_next, /* tp_iternext */
9262 unicodeiter_methods, /* tp_methods */
9263 0,
9264};
9265
9266static PyObject *
9267unicode_iter(PyObject *seq)
9268{
9269 unicodeiterobject *it;
9270
9271 if (!PyUnicode_Check(seq)) {
9272 PyErr_BadInternalCall();
9273 return NULL;
9274 }
9275 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9276 if (it == NULL)
9277 return NULL;
9278 it->it_index = 0;
9279 Py_INCREF(seq);
9280 it->it_seq = (PyUnicodeObject *)seq;
9281 _PyObject_GC_TRACK(it);
9282 return (PyObject *)it;
9283}
9284
Martin v. Löwis5b222132007-06-10 09:51:05 +00009285size_t
9286Py_UNICODE_strlen(const Py_UNICODE *u)
9287{
9288 int res = 0;
9289 while(*u++)
9290 res++;
9291 return res;
9292}
9293
9294Py_UNICODE*
9295Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9296{
9297 Py_UNICODE *u = s1;
9298 while ((*u++ = *s2++));
9299 return s1;
9300}
9301
9302Py_UNICODE*
9303Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9304{
9305 Py_UNICODE *u = s1;
9306 while ((*u++ = *s2++))
9307 if (n-- == 0)
9308 break;
9309 return s1;
9310}
9311
9312int
9313Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9314{
9315 while (*s1 && *s2 && *s1 == *s2)
9316 s1++, s2++;
9317 if (*s1 && *s2)
9318 return (*s1 < *s2) ? -1 : +1;
9319 if (*s1)
9320 return 1;
9321 if (*s2)
9322 return -1;
9323 return 0;
9324}
9325
9326Py_UNICODE*
9327Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9328{
9329 const Py_UNICODE *p;
9330 for (p = s; *p; p++)
9331 if (*p == c)
9332 return (Py_UNICODE*)p;
9333 return NULL;
9334}
9335
9336
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009337#ifdef __cplusplus
9338}
9339#endif
9340
9341
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009342/*
9343Local variables:
9344c-basic-offset: 4
9345indent-tabs-mode: nil
9346End:
9347*/