blob: c77751de657030b5fb759048fafeb0d8523060ad [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
57#define MAX_UNICODE_FREELIST_SIZE 1024
58
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Barry Warsaw51ac5802000-03-20 16:36:48 +000065 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000109static PyUnicodeObject *unicode_freelist;
110static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000129PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000130{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000131#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000132 return 0x10FFFF;
133#else
134 /* This is actually an illegal character, so it should
135 not be passed to unichr. */
136 return 0xFFFF;
137#endif
138}
139
Thomas Wouters477c8d52006-05-27 19:21:47 +0000140/* --- Bloom Filters ----------------------------------------------------- */
141
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
145
146/* the linebreak mask is set up by Unicode_Init below */
147
/* Integer type holding one bloom-filter bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask consulted by BLOOM_LINEBREAK(). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long so that selecting bit 31 never
   shifts into the sign bit of an int (undefined behaviour), and mask is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True when ch passes the cheap bloom pre-check and is confirmed as a
   line break by Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
156
157Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
158{
159 /* calculate simple bloom-style bitmask for a given unicode string */
160
161 long mask;
162 Py_ssize_t i;
163
164 mask = 0;
165 for (i = 0; i < len; i++)
166 mask |= (1 << (ptr[i] & 0x1F));
167
168 return mask;
169}
170
171Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
172{
173 Py_ssize_t i;
174
175 for (i = 0; i < setlen; i++)
176 if (set[i] == chr)
177 return 1;
178
179 return 0;
180}
181
/* True when chr passes the bloom pre-check against mask and is really
   contained in set[0..setlen-1].  The whole expansion is parenthesized so
   the macro behaves as a single operand (e.g. under `!`). */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
184
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185/* --- Unicode Object ----------------------------------------------------- */
186
187static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000189 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190{
191 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000192
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000193 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 /* Resizing shared object (unicode_empty or single character
198 objects) in-place is not allowed. Use PyUnicode_Resize()
199 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 if (unicode == unicode_empty ||
202 (unicode->length == 1 &&
203 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 return -1;
208 }
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210 /* We allocate one more byte to make sure the string is Ux0000 terminated.
211 The overallocation is also used by fastsearch, which assumes that it's
212 safe to look at str[length] (without making any assumptions about what
213 it contains). */
214
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 oldstr = unicode->str;
216 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
217 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 PyErr_NoMemory();
220 return -1;
221 }
222 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000223 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000225 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000227 if (unicode->defenc) {
228 Py_DECREF(unicode->defenc);
229 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 }
231 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 return 0;
234}
235
/* We allocate one more Py_UNICODE element than requested to make sure the
   string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
244
245static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 register PyUnicodeObject *unicode;
249
Thomas Wouters477c8d52006-05-27 19:21:47 +0000250 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (length == 0 && unicode_empty != NULL) {
252 Py_INCREF(unicode_empty);
253 return unicode_empty;
254 }
255
256 /* Unicode freelist & memory allocation */
257 if (unicode_freelist) {
258 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000259 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Keep-Alive optimization: we only upsize the buffer,
263 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000264 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000265 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000266 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 }
269 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000270 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000272 }
273 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000276 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (unicode == NULL)
278 return NULL;
279 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
280 }
281
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000282 if (!unicode->str) {
283 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000284 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000285 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000287 * the caller fails before initializing str -- unicode_resize()
288 * reads str[0], and the Keep-Alive optimization can keep memory
289 * allocated for str alive across a call to unicode_dealloc(unicode).
290 * We don't want unicode_resize to read uninitialized memory in
291 * that case.
292 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000293 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000297 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000298 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000300
301 onError:
302 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000303 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305}
306
307static
Guido van Rossum9475a232001-10-05 20:51:39 +0000308void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
Walter Dörwald16807132007-05-25 13:52:07 +0000310 switch (PyUnicode_CHECK_INTERNED(unicode)) {
311 case SSTATE_NOT_INTERNED:
312 break;
313
314 case SSTATE_INTERNED_MORTAL:
315 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000316 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000317 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
318 Py_FatalError(
319 "deletion of interned unicode string failed");
320 break;
321
322 case SSTATE_INTERNED_IMMORTAL:
323 Py_FatalError("Immortal interned unicode string died.");
324
325 default:
326 Py_FatalError("Inconsistent interned unicode string state.");
327 }
328
Guido van Rossum604ddf82001-12-06 20:03:56 +0000329 if (PyUnicode_CheckExact(unicode) &&
330 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000331 /* Keep-Alive optimization */
332 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000333 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 unicode->str = NULL;
335 unicode->length = 0;
336 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000337 if (unicode->defenc) {
338 Py_DECREF(unicode->defenc);
339 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000340 }
341 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 *(PyUnicodeObject **)unicode = unicode_freelist;
343 unicode_freelist = unicode;
344 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000347 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000349 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 }
351}
352
Martin v. Löwis18e16552006-02-15 17:27:45 +0000353int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000354{
355 register PyUnicodeObject *v;
356
357 /* Argument checks */
358 if (unicode == NULL) {
359 PyErr_BadInternalCall();
360 return -1;
361 }
362 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000363 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 PyErr_BadInternalCall();
365 return -1;
366 }
367
368 /* Resizing unicode_empty and single character objects is not
369 possible since these are being shared. We simply return a fresh
370 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000371 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 (v == unicode_empty || v->length == 1)) {
373 PyUnicodeObject *w = _PyUnicode_New(length);
374 if (w == NULL)
375 return -1;
376 Py_UNICODE_COPY(w->str, v->str,
377 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000378 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 *unicode = (PyObject *)w;
380 return 0;
381 }
382
383 /* Note that we don't have to modify *unicode for unshared Unicode
384 objects, since we can modify them in-place. */
385 return unicode_resize(v, length);
386}
387
388/* Internal API for use in unicodeobject.c only ! */
389#define _PyUnicode_Resize(unicodevar, length) \
390 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000393 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 PyUnicodeObject *unicode;
396
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000397 /* If the Unicode data is known at construction time, we can apply
398 some optimizations which share commonly used objects. */
399 if (u != NULL) {
400
401 /* Optimization for empty strings */
402 if (size == 0 && unicode_empty != NULL) {
403 Py_INCREF(unicode_empty);
404 return (PyObject *)unicode_empty;
405 }
406
407 /* Single character Unicode objects in the Latin-1 range are
408 shared when using this constructor */
409 if (size == 1 && *u < 256) {
410 unicode = unicode_latin1[*u];
411 if (!unicode) {
412 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413 if (!unicode)
414 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000415 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 unicode_latin1[*u] = unicode;
417 }
418 Py_INCREF(unicode);
419 return (PyObject *)unicode;
420 }
421 }
Tim Petersced69f82003-09-16 20:30:58 +0000422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 unicode = _PyUnicode_New(size);
424 if (!unicode)
425 return NULL;
426
427 /* Copy the Unicode data into the new object */
428 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430
431 return (PyObject *)unicode;
432}
433
Walter Dörwaldd2034312007-05-18 16:29:38 +0000434PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000435{
436 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000437 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000438 some optimizations which share commonly used objects.
439 Also, this means the input must be UTF-8, so fall back to the
440 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000441 if (u != NULL) {
442
443 /* Optimization for empty strings */
444 if (size == 0 && unicode_empty != NULL) {
445 Py_INCREF(unicode_empty);
446 return (PyObject *)unicode_empty;
447 }
448
Martin v. Löwis9c121062007-08-05 20:26:11 +0000449 /* Single characters are shared when using this constructor.
450 Restrict to ASCII, since the input must be UTF-8. */
451 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000453 if (!unicode) {
454 unicode = _PyUnicode_New(1);
455 if (!unicode)
456 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000457 unicode->str[0] = Py_CHARMASK(*u);
458 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000459 }
460 Py_INCREF(unicode);
461 return (PyObject *)unicode;
462 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000463
464 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
Walter Dörwald55507312007-05-18 13:12:10 +0000467 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000468 if (!unicode)
469 return NULL;
470
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000471 return (PyObject *)unicode;
472}
473
Walter Dörwaldd2034312007-05-18 16:29:38 +0000474PyObject *PyUnicode_FromString(const char *u)
475{
476 size_t size = strlen(u);
477 if (size > PY_SSIZE_T_MAX) {
478 PyErr_SetString(PyExc_OverflowError, "input too long");
479 return NULL;
480 }
481
482 return PyUnicode_FromStringAndSize(u, size);
483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485#ifdef HAVE_WCHAR_H
486
487PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000488 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489{
490 PyUnicodeObject *unicode;
491
492 if (w == NULL) {
493 PyErr_BadInternalCall();
494 return NULL;
495 }
496
497 unicode = _PyUnicode_New(size);
498 if (!unicode)
499 return NULL;
500
501 /* Copy the wchar_t data into the new object */
502#ifdef HAVE_USABLE_WCHAR_T
503 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000504#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 {
506 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000509 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 *u++ = *w++;
511 }
512#endif
513
514 return (PyObject *)unicode;
515}
516
Walter Dörwald346737f2007-05-31 10:44:43 +0000517static void
518makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
519{
520 *fmt++ = '%';
521 if (width) {
522 if (zeropad)
523 *fmt++ = '0';
524 fmt += sprintf(fmt, "%d", width);
525 }
526 if (precision)
527 fmt += sprintf(fmt, ".%d", precision);
528 if (longflag)
529 *fmt++ = 'l';
530 else if (size_tflag) {
531 char *f = PY_FORMAT_SIZE_T;
532 while (*f)
533 *fmt++ = *f++;
534 }
535 *fmt++ = c;
536 *fmt = '\0';
537}
538
/* Append the NUL-terminated char string `string` to the output position
   `s`, advancing `s`.  Relies on `copy` (const char *) and `s` being
   declared in the enclosing scope.  Wrapped in do { } while (0) so that
   `appendstring(x);` is a single statement, e.g. inside if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)
540
541PyObject *
542PyUnicode_FromFormatV(const char *format, va_list vargs)
543{
544 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000545 Py_ssize_t callcount = 0;
546 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000547 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 int width = 0;
550 int precision = 0;
551 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000552 const char* f;
553 Py_UNICODE *s;
554 PyObject *string;
555 /* used by sprintf */
556 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000557 /* use abuffer instead of buffer, if we need more space
558 * (which can happen if there's a format specifier with width). */
559 char *abuffer = NULL;
560 char *realbuffer;
561 Py_ssize_t abuffersize = 0;
562 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000563 const char *copy;
564
565#ifdef VA_LIST_IS_ARRAY
566 Py_MEMCPY(count, vargs, sizeof(va_list));
567#else
568#ifdef __va_copy
569 __va_copy(count, vargs);
570#else
571 count = vargs;
572#endif
573#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 1: count the number of %S/%R format specifications
575 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
576 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000577 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000578 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 ++callcount;
580 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000581 /* step 2: allocate memory for the results of
582 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000583 if (callcount) {
584 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
585 if (!callresults) {
586 PyErr_NoMemory();
587 return NULL;
588 }
589 callresult = callresults;
590 }
591 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000592 for (f = format; *f; f++) {
593 if (*f == '%') {
594 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000595 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000596 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000597 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000598 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000599 ;
600
601 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
602 * they don't affect the amount of space we reserve.
603 */
604 if ((*f == 'l' || *f == 'z') &&
605 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000606 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607
608 switch (*f) {
609 case 'c':
610 (void)va_arg(count, int);
611 /* fall through... */
612 case '%':
613 n++;
614 break;
615 case 'd': case 'u': case 'i': case 'x':
616 (void) va_arg(count, int);
617 /* 20 bytes is enough to hold a 64-bit
618 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000619 This isn't enough for octal.
620 If a width is specified we need more
621 (which we allocate later). */
622 if (width < 20)
623 width = 20;
624 n += width;
625 if (abuffersize < width)
626 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000627 break;
628 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000629 {
630 /* UTF-8 */
631 unsigned char*s;
632 s = va_arg(count, unsigned char*);
633 while (*s) {
634 if (*s < 128) {
635 n++; s++;
636 } else if (*s < 0xc0) {
637 /* invalid UTF-8 */
638 n++; s++;
639 } else if (*s < 0xc0) {
640 n++;
641 s++; if(!*s)break;
642 s++;
643 } else if (*s < 0xe0) {
644 n++;
645 s++; if(!*s)break;
646 s++; if(!*s)break;
647 s++;
648 } else {
649 #ifdef Py_UNICODE_WIDE
650 n++;
651 #else
652 n+=2;
653 #endif
654 s++; if(!*s)break;
655 s++; if(!*s)break;
656 s++; if(!*s)break;
657 s++;
658 }
659 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000660 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000661 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 case 'U':
663 {
664 PyObject *obj = va_arg(count, PyObject *);
665 assert(obj && PyUnicode_Check(obj));
666 n += PyUnicode_GET_SIZE(obj);
667 break;
668 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000669 case 'V':
670 {
671 PyObject *obj = va_arg(count, PyObject *);
672 const char *str = va_arg(count, const char *);
673 assert(obj || str);
674 assert(!obj || PyUnicode_Check(obj));
675 if (obj)
676 n += PyUnicode_GET_SIZE(obj);
677 else
678 n += strlen(str);
679 break;
680 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000681 case 'S':
682 {
683 PyObject *obj = va_arg(count, PyObject *);
684 PyObject *str;
685 assert(obj);
686 str = PyObject_Unicode(obj);
687 if (!str)
688 goto fail;
689 n += PyUnicode_GET_SIZE(str);
690 /* Remember the str and switch to the next slot */
691 *callresult++ = str;
692 break;
693 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000694 case 'R':
695 {
696 PyObject *obj = va_arg(count, PyObject *);
697 PyObject *repr;
698 assert(obj);
699 repr = PyObject_Repr(obj);
700 if (!repr)
701 goto fail;
702 n += PyUnicode_GET_SIZE(repr);
703 /* Remember the repr and switch to the next slot */
704 *callresult++ = repr;
705 break;
706 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000707 case 'p':
708 (void) va_arg(count, int);
709 /* maximum 64-bit pointer representation:
710 * 0xffffffffffffffff
711 * so 19 characters is enough.
712 * XXX I count 18 -- what's the extra for?
713 */
714 n += 19;
715 break;
716 default:
717 /* if we stumble upon an unknown
718 formatting code, copy the rest of
719 the format string to the output
720 string. (we cannot just skip the
721 code, since there's no way to know
722 what's in the argument list) */
723 n += strlen(p);
724 goto expand;
725 }
726 } else
727 n++;
728 }
729 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 if (abuffersize > 20) {
731 abuffer = PyMem_Malloc(abuffersize);
732 if (!abuffer) {
733 PyErr_NoMemory();
734 goto fail;
735 }
736 realbuffer = abuffer;
737 }
738 else
739 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000740 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000741 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000742 we don't have to resize the string.
743 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 string = PyUnicode_FromUnicode(NULL, n);
745 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747
748 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000749 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750
751 for (f = format; *f; f++) {
752 if (*f == '%') {
753 const char* p = f++;
754 int longflag = 0;
755 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 zeropad = (*f == '0');
757 /* parse the width.precision part */
758 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000759 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 width = (width*10) + *f++ - '0';
761 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 if (*f == '.') {
763 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000764 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 /* handle the long flag, but only for %ld and %lu.
768 others can be added when necessary. */
769 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
770 longflag = 1;
771 ++f;
772 }
773 /* handle the size_t flag. */
774 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
775 size_tflag = 1;
776 ++f;
777 }
778
779 switch (*f) {
780 case 'c':
781 *s++ = va_arg(vargs, int);
782 break;
783 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000784 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000786 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000788 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000789 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000790 sprintf(realbuffer, fmt, va_arg(vargs, int));
791 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 break;
793 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000794 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000795 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000796 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000798 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000800 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
801 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 break;
803 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
805 sprintf(realbuffer, fmt, va_arg(vargs, int));
806 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 break;
808 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000809 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
810 sprintf(realbuffer, fmt, va_arg(vargs, int));
811 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000812 break;
813 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000814 {
815 /* Parameter must be UTF-8 encoded.
816 In case of encoding errors, use
817 the replacement character. */
818 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000820 u = PyUnicode_DecodeUTF8(p, strlen(p),
821 "replace");
822 if (!u)
823 goto fail;
824 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
825 PyUnicode_GET_SIZE(u));
826 s += PyUnicode_GET_SIZE(u);
827 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000829 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000830 case 'U':
831 {
832 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000833 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
834 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
835 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 break;
837 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000838 case 'V':
839 {
840 PyObject *obj = va_arg(vargs, PyObject *);
841 const char *str = va_arg(vargs, const char *);
842 if (obj) {
843 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
844 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
845 s += size;
846 } else {
847 appendstring(str);
848 }
849 break;
850 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000851 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 case 'R':
853 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000854 Py_UNICODE *ucopy;
855 Py_ssize_t usize;
856 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000857 /* unused, since we already have the result */
858 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000859 ucopy = PyUnicode_AS_UNICODE(*callresult);
860 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 for (upos = 0; upos<usize;)
862 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000863 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000864 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000865 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000866 ++callresult;
867 break;
868 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000869 case 'p':
870 sprintf(buffer, "%p", va_arg(vargs, void*));
871 /* %p is ill-defined: ensure leading 0x. */
872 if (buffer[1] == 'X')
873 buffer[1] = 'x';
874 else if (buffer[1] != 'x') {
875 memmove(buffer+2, buffer, strlen(buffer)+1);
876 buffer[0] = '0';
877 buffer[1] = 'x';
878 }
879 appendstring(buffer);
880 break;
881 case '%':
882 *s++ = '%';
883 break;
884 default:
885 appendstring(p);
886 goto end;
887 }
888 } else
889 *s++ = *f;
890 }
891
892 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000893 if (callresults)
894 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 if (abuffer)
896 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
898 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000899 fail:
900 if (callresults) {
901 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000902 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000903 Py_DECREF(*callresult2);
904 ++callresult2;
905 }
906 PyMem_Free(callresults);
907 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000908 if (abuffer)
909 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000910 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000911}
912
913#undef appendstring
914
915PyObject *
916PyUnicode_FromFormat(const char *format, ...)
917{
918 PyObject* ret;
919 va_list vargs;
920
921#ifdef HAVE_STDARG_PROTOTYPES
922 va_start(vargs, format);
923#else
924 va_start(vargs);
925#endif
926 ret = PyUnicode_FromFormatV(format, vargs);
927 va_end(vargs);
928 return ret;
929}
930
Martin v. Löwis18e16552006-02-15 17:27:45 +0000931Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
932 wchar_t *w,
933 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934{
935 if (unicode == NULL) {
936 PyErr_BadInternalCall();
937 return -1;
938 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000939
940 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000942 size = PyUnicode_GET_SIZE(unicode) + 1;
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944#ifdef HAVE_USABLE_WCHAR_T
945 memcpy(w, unicode->str, size * sizeof(wchar_t));
946#else
947 {
948 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000949 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000951 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 *w++ = *u++;
953 }
954#endif
955
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000956 if (size > PyUnicode_GET_SIZE(unicode))
957 return PyUnicode_GET_SIZE(unicode);
958 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000959 return size;
960}
961
962#endif
963
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000964PyObject *PyUnicode_FromOrdinal(int ordinal)
965{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000966 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000967
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000968 if (ordinal < 0 || ordinal > 0x10ffff) {
969 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000970 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000971 return NULL;
972 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000973
974#ifndef Py_UNICODE_WIDE
975 if (ordinal > 0xffff) {
976 ordinal -= 0x10000;
977 s[0] = 0xD800 | (ordinal >> 10);
978 s[1] = 0xDC00 | (ordinal & 0x3FF);
979 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000980 }
981#endif
982
Hye-Shik Chang40574832004-04-06 07:24:51 +0000983 s[0] = (Py_UNICODE)ordinal;
984 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000985}
986
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987PyObject *PyUnicode_FromObject(register PyObject *obj)
988{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989 /* XXX Perhaps we should make this API an alias of
990 PyObject_Unicode() instead ?! */
991 if (PyUnicode_CheckExact(obj)) {
992 Py_INCREF(obj);
993 return obj;
994 }
995 if (PyUnicode_Check(obj)) {
996 /* For a Unicode subtype that's not a Unicode object,
997 return a true Unicode object with the same data. */
998 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
999 PyUnicode_GET_SIZE(obj));
1000 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001001 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1002}
1003
1004PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1005 const char *encoding,
1006 const char *errors)
1007{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001008 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001009 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001011
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 if (obj == NULL) {
1013 PyErr_BadInternalCall();
1014 return NULL;
1015 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001017 if (PyUnicode_Check(obj)) {
1018 PyErr_SetString(PyExc_TypeError,
1019 "decoding Unicode is not supported");
1020 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001021 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001022
1023 /* Coerce object */
1024 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001025 s = PyString_AS_STRING(obj);
1026 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001027 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001028 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1029 /* Overwrite the error message with something more useful in
1030 case of a TypeError. */
1031 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001032 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001033 "coercing to Unicode: need string or buffer, "
1034 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001035 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001036 goto onError;
1037 }
Tim Petersced69f82003-09-16 20:30:58 +00001038
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001039 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 if (len == 0) {
1041 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 }
Tim Petersced69f82003-09-16 20:30:58 +00001044 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001046
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001047 return v;
1048
1049 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051}
1052
1053PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001054 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 const char *encoding,
1056 const char *errors)
1057{
1058 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001059 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001060 char lower[20]; /* Enough for any encoding name we recognize */
1061 char *l;
1062 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001063
1064 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001065 encoding = PyUnicode_GetDefaultEncoding();
1066
1067 /* Convert encoding to lower case and replace '_' with '-' in order to
1068 catch e.g. UTF_8 */
1069 e = encoding;
1070 l = lower;
1071 while (*e && l < &lower[(sizeof lower) - 2]) {
1072 if (ISUPPER(*e)) {
1073 *l++ = TOLOWER(*e++);
1074 }
1075 else if (*e == '_') {
1076 *l++ = '-';
1077 e++;
1078 }
1079 else {
1080 *l++ = *e++;
1081 }
1082 }
1083 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001084
1085 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001086 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001088 else if ((strcmp(lower, "latin-1") == 0) ||
1089 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001090 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001091#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001092 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001093 return PyUnicode_DecodeMBCS(s, size, errors);
1094#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001095 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001096 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001097 else if (strcmp(lower, "utf-16") == 0)
1098 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1099 else if (strcmp(lower, "utf-32") == 0)
1100 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101
1102 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001103 buffer = NULL;
1104 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1105 goto onError;
1106 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107 if (buffer == NULL)
1108 goto onError;
1109 unicode = PyCodec_Decode(buffer, encoding, errors);
1110 if (unicode == NULL)
1111 goto onError;
1112 if (!PyUnicode_Check(unicode)) {
1113 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001114 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001115 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 Py_DECREF(unicode);
1117 goto onError;
1118 }
1119 Py_DECREF(buffer);
1120 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001121
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 onError:
1123 Py_XDECREF(buffer);
1124 return NULL;
1125}
1126
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001127PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1128 const char *encoding,
1129 const char *errors)
1130{
1131 PyObject *v;
1132
1133 if (!PyUnicode_Check(unicode)) {
1134 PyErr_BadArgument();
1135 goto onError;
1136 }
1137
1138 if (encoding == NULL)
1139 encoding = PyUnicode_GetDefaultEncoding();
1140
1141 /* Decode via the codec registry */
1142 v = PyCodec_Decode(unicode, encoding, errors);
1143 if (v == NULL)
1144 goto onError;
1145 return v;
1146
1147 onError:
1148 return NULL;
1149}
1150
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001152 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 const char *encoding,
1154 const char *errors)
1155{
1156 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001157
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 unicode = PyUnicode_FromUnicode(s, size);
1159 if (unicode == NULL)
1160 return NULL;
1161 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1162 Py_DECREF(unicode);
1163 return v;
1164}
1165
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001166PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1167 const char *encoding,
1168 const char *errors)
1169{
1170 PyObject *v;
1171
1172 if (!PyUnicode_Check(unicode)) {
1173 PyErr_BadArgument();
1174 goto onError;
1175 }
1176
1177 if (encoding == NULL)
1178 encoding = PyUnicode_GetDefaultEncoding();
1179
1180 /* Encode via the codec registry */
1181 v = PyCodec_Encode(unicode, encoding, errors);
1182 if (v == NULL)
1183 goto onError;
1184 return v;
1185
1186 onError:
1187 return NULL;
1188}
1189
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1191 const char *encoding,
1192 const char *errors)
1193{
1194 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001195
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_BadArgument();
1198 goto onError;
1199 }
Fred Drakee4315f52000-05-09 19:53:39 +00001200
Tim Petersced69f82003-09-16 20:30:58 +00001201 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001202 encoding = PyUnicode_GetDefaultEncoding();
1203
1204 /* Shortcuts for common default encodings */
1205 if (errors == NULL) {
1206 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001207 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001208 else if (strcmp(encoding, "latin-1") == 0)
1209 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001210#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1211 else if (strcmp(encoding, "mbcs") == 0)
1212 return PyUnicode_AsMBCSString(unicode);
1213#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001214 else if (strcmp(encoding, "ascii") == 0)
1215 return PyUnicode_AsASCIIString(unicode);
1216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218 /* Encode via the codec registry */
1219 v = PyCodec_Encode(unicode, encoding, errors);
1220 if (v == NULL)
1221 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001222 if (!PyBytes_Check(v)) {
1223 if (PyString_Check(v)) {
1224 /* Old codec, turn it into bytes */
1225 PyObject *b = PyBytes_FromObject(v);
1226 Py_DECREF(v);
1227 return b;
1228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001230 "encoder did not return a bytes object "
1231 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1232 v->ob_type->tp_name,
1233 encoding ? encoding : "NULL",
1234 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 Py_DECREF(v);
1236 goto onError;
1237 }
1238 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001239
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 onError:
1241 return NULL;
1242}
1243
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001244PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1245 const char *errors)
1246{
1247 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001248 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001249 if (v)
1250 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001251 if (errors != NULL)
1252 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001253 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1254 PyUnicode_GET_SIZE(unicode),
1255 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001256 if (!b)
1257 return NULL;
1258 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1259 PyBytes_Size(b));
1260 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001261 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001262 return v;
1263}
1264
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001265PyObject*
1266PyUnicode_DecodeFSDefault(const char *s)
1267{
1268 Py_ssize_t size = (Py_ssize_t)strlen(s);
1269
1270 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1271 can be undefined. If it is case, decode using UTF-8. The following assumes
1272 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1273 bootstrapping process where the codecs aren't ready yet.
1274 */
1275 if (Py_FileSystemDefaultEncoding) {
1276#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1277 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs")) {
1278 return PyUnicode_DecodeMBCS(s, size, "replace");
1279 }
1280#elif defined(__APPLE__)
1281 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8")) {
1282 return PyUnicode_DecodeUTF8(s, size, "replace");
1283 }
1284#endif
1285 return PyUnicode_Decode(s, size,
1286 Py_FileSystemDefaultEncoding,
1287 "replace");
1288 }
1289 else {
1290 return PyUnicode_DecodeUTF8(s, size, "replace");
1291 }
1292}
1293
Martin v. Löwis5b222132007-06-10 09:51:05 +00001294char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001295PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001296{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001297 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001298 if (!PyUnicode_Check(unicode)) {
1299 PyErr_BadArgument();
1300 return NULL;
1301 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001302 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1303 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001304 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001305 if (psize != NULL)
1306 *psize = PyString_GET_SIZE(str8);
1307 return PyString_AS_STRING(str8);
1308}
1309
1310char*
1311PyUnicode_AsString(PyObject *unicode)
1312{
1313 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001314}
1315
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1317{
1318 if (!PyUnicode_Check(unicode)) {
1319 PyErr_BadArgument();
1320 goto onError;
1321 }
1322 return PyUnicode_AS_UNICODE(unicode);
1323
1324 onError:
1325 return NULL;
1326}
1327
Martin v. Löwis18e16552006-02-15 17:27:45 +00001328Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329{
1330 if (!PyUnicode_Check(unicode)) {
1331 PyErr_BadArgument();
1332 goto onError;
1333 }
1334 return PyUnicode_GET_SIZE(unicode);
1335
1336 onError:
1337 return -1;
1338}
1339
Thomas Wouters78890102000-07-22 19:25:51 +00001340const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001341{
1342 return unicode_default_encoding;
1343}
1344
1345int PyUnicode_SetDefaultEncoding(const char *encoding)
1346{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001347 if (strcmp(encoding, unicode_default_encoding) != 0) {
1348 PyErr_Format(PyExc_ValueError,
1349 "Can only set default encoding to %s",
1350 unicode_default_encoding);
1351 return -1;
1352 }
Fred Drakee4315f52000-05-09 19:53:39 +00001353 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001354}
1355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001356/* error handling callback helper:
1357 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001358 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359 and adjust various state variables.
1360 return 0 on success, -1 on error
1361*/
1362
1363static
1364int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1365 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001366 const char **input, const char **inend, Py_ssize_t *startinpos,
1367 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001368 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001369{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001370 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371
1372 PyObject *restuple = NULL;
1373 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001374 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001375 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001376 Py_ssize_t requiredsize;
1377 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001378 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001379 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001380 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001381 int res = -1;
1382
1383 if (*errorHandler == NULL) {
1384 *errorHandler = PyCodec_LookupError(errors);
1385 if (*errorHandler == NULL)
1386 goto onError;
1387 }
1388
1389 if (*exceptionObject == NULL) {
1390 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001391 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001392 if (*exceptionObject == NULL)
1393 goto onError;
1394 }
1395 else {
1396 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1397 goto onError;
1398 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1399 goto onError;
1400 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1401 goto onError;
1402 }
1403
1404 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1405 if (restuple == NULL)
1406 goto onError;
1407 if (!PyTuple_Check(restuple)) {
1408 PyErr_Format(PyExc_TypeError, &argparse[4]);
1409 goto onError;
1410 }
1411 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1412 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001413
1414 /* Copy back the bytes variables, which might have been modified by the
1415 callback */
1416 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1417 if (!inputobj)
1418 goto onError;
1419 if (!PyBytes_Check(inputobj)) {
1420 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1421 }
1422 *input = PyBytes_AS_STRING(inputobj);
1423 insize = PyBytes_GET_SIZE(inputobj);
1424 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001425 /* we can DECREF safely, as the exception has another reference,
1426 so the object won't go away. */
1427 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001430 newpos = insize+newpos;
1431 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001432 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001433 goto onError;
1434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435
1436 /* need more space? (at least enough for what we
1437 have+the replacement+the rest of the string (starting
1438 at the new input position), so we won't have to check space
1439 when there are no errors in the rest of the string) */
1440 repptr = PyUnicode_AS_UNICODE(repunicode);
1441 repsize = PyUnicode_GET_SIZE(repunicode);
1442 requiredsize = *outpos + repsize + insize-newpos;
1443 if (requiredsize > outsize) {
1444 if (requiredsize<2*outsize)
1445 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001446 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447 goto onError;
1448 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1449 }
1450 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001451 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 Py_UNICODE_COPY(*outptr, repptr, repsize);
1453 *outptr += repsize;
1454 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 /* we made it! */
1457 res = 0;
1458
1459 onError:
1460 Py_XDECREF(restuple);
1461 return res;
1462}
1463
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001464/* --- UTF-7 Codec -------------------------------------------------------- */
1465
1466/* see RFC2152 for details */
1467
static
char utf7_special[128] = {
    /* Classification table for the UTF-7 codec, indexed by the character
       code (0..127; each row below covers 16 codes).  It indicates
       whether a UTF-7 character is special, i.e. cannot be directly
       encoded:
         0 - not special
         1 - special
         2 - whitespace (optional)
         3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1486
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true if character c may not be written
   directly.  That is the case for codes outside 1..127, for table class 1,
   for whitespace (class 2) when encodeWS is set and for Set O characters
   (class 3) when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c belongs to the base64 alphabet. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64(), the 6-bit value of base64 character c.
   Only meaningful for characters accepted by B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write
   the topmost 6 pending bits through `out` as a base64 character and
   reduce `bits` accordingly.  Expands to a bare while statement (not
   wrapped in do/while(0)) and modifies `out` and `bits`. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the topmost 16 pending bits as one Py_UNICODE and write
   it through `out`.  A value in 0xDC00..0xDFFF is reported as an error
   instead: the macro assigns to `errmsg` and jumps to the label
   `utf7Error`, both of which must exist in the expanding function.
   Expands to a bare while statement (not wrapped in do/while(0)). */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001529
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001530PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001531 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001532 const char *errors)
1533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001535 Py_ssize_t startinpos;
1536 Py_ssize_t endinpos;
1537 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 const char *e;
1539 PyUnicodeObject *unicode;
1540 Py_UNICODE *p;
1541 const char *errmsg = "";
1542 int inShift = 0;
1543 unsigned int bitsleft = 0;
1544 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 int surrogate = 0;
1546 PyObject *errorHandler = NULL;
1547 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548
1549 unicode = _PyUnicode_New(size);
1550 if (!unicode)
1551 return NULL;
1552 if (size == 0)
1553 return (PyObject *)unicode;
1554
1555 p = unicode->str;
1556 e = s + size;
1557
1558 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001559 Py_UNICODE ch;
1560 restart:
1561 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001562
1563 if (inShift) {
1564 if ((ch == '-') || !B64CHAR(ch)) {
1565 inShift = 0;
1566 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001567
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001568 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1569 if (bitsleft >= 6) {
1570 /* The shift sequence has a partial character in it. If
1571 bitsleft < 6 then we could just classify it as padding
1572 but that is not the case here */
1573
1574 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001575 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 }
1577 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001578 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001579 here so indicate the potential of a misencoded character. */
1580
1581 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1582 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1583 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001584 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001585 }
1586
1587 if (ch == '-') {
1588 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001589 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590 inShift = 1;
1591 }
1592 } else if (SPECIAL(ch,0,0)) {
1593 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001594 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001595 } else {
1596 *p++ = ch;
1597 }
1598 } else {
1599 charsleft = (charsleft << 6) | UB64(ch);
1600 bitsleft += 6;
1601 s++;
1602 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1603 }
1604 }
1605 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001607 s++;
1608 if (s < e && *s == '-') {
1609 s++;
1610 *p++ = '+';
1611 } else
1612 {
1613 inShift = 1;
1614 bitsleft = 0;
1615 }
1616 }
1617 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001618 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001619 errmsg = "unexpected special character";
1620 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001621 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 }
1623 else {
1624 *p++ = ch;
1625 s++;
1626 }
1627 continue;
1628 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 outpos = p-PyUnicode_AS_UNICODE(unicode);
1630 endinpos = s-starts;
1631 if (unicode_decode_call_errorhandler(
1632 errors, &errorHandler,
1633 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001634 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001635 (PyObject **)&unicode, &outpos, &p))
1636 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637 }
1638
1639 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 outpos = p-PyUnicode_AS_UNICODE(unicode);
1641 endinpos = size;
1642 if (unicode_decode_call_errorhandler(
1643 errors, &errorHandler,
1644 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001645 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 if (s < e)
1649 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650 }
1651
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001652 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 goto onError;
1654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655 Py_XDECREF(errorHandler);
1656 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 return (PyObject *)unicode;
1658
1659onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 Py_XDECREF(errorHandler);
1661 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 Py_DECREF(unicode);
1663 return NULL;
1664}
1665
1666
1667PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001668 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001669 int encodeSetO,
1670 int encodeWhiteSpace,
1671 const char *errors)
1672{
1673 PyObject *v;
1674 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001675 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001677 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001678 unsigned int bitsleft = 0;
1679 unsigned long charsleft = 0;
1680 char * out;
1681 char * start;
1682
1683 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001684 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685
Walter Dörwald51ab4142007-05-05 14:43:36 +00001686 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001687 if (v == NULL)
1688 return NULL;
1689
Walter Dörwald51ab4142007-05-05 14:43:36 +00001690 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 for (;i < size; ++i) {
1692 Py_UNICODE ch = s[i];
1693
1694 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001695 if (ch == '+') {
1696 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001697 *out++ = '-';
1698 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1699 charsleft = ch;
1700 bitsleft = 16;
1701 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001702 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001703 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001704 } else {
1705 *out++ = (char) ch;
1706 }
1707 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1709 *out++ = B64(charsleft << (6-bitsleft));
1710 charsleft = 0;
1711 bitsleft = 0;
1712 /* Characters not in the BASE64 set implicitly unshift the sequence
1713 so no '-' is required, except if the character is itself a '-' */
1714 if (B64CHAR(ch) || ch == '-') {
1715 *out++ = '-';
1716 }
1717 inShift = 0;
1718 *out++ = (char) ch;
1719 } else {
1720 bitsleft += 16;
1721 charsleft = (charsleft << 16) | ch;
1722 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1723
1724 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001725 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 or '-' then the shift sequence will be terminated implicitly and we
1727 don't have to insert a '-'. */
1728
1729 if (bitsleft == 0) {
1730 if (i + 1 < size) {
1731 Py_UNICODE ch2 = s[i+1];
1732
1733 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001734
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001735 } else if (B64CHAR(ch2) || ch2 == '-') {
1736 *out++ = '-';
1737 inShift = 0;
1738 } else {
1739 inShift = 0;
1740 }
1741
1742 }
1743 else {
1744 *out++ = '-';
1745 inShift = 0;
1746 }
1747 }
Tim Petersced69f82003-09-16 20:30:58 +00001748 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751 if (bitsleft) {
1752 *out++= B64(charsleft << (6-bitsleft) );
1753 *out++ = '-';
1754 }
1755
Walter Dörwald51ab4142007-05-05 14:43:36 +00001756 if (PyBytes_Resize(v, out - start)) {
1757 Py_DECREF(v);
1758 return NULL;
1759 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001760 return v;
1761}
1762
1763#undef SPECIAL
1764#undef B64
1765#undef B64CHAR
1766#undef UB64
1767#undef ENCODE
1768#undef DECODE
1769
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770/* --- UTF-8 Codec -------------------------------------------------------- */
1771
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero marks a byte that cannot start a sequence.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four; 0xF8-0xFB: five; 0xFC-0xFD: six; 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1793
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001795 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 const char *errors)
1797{
Walter Dörwald69652032004-09-07 20:24:22 +00001798 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1799}
1800
1801PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001802 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001803 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001804 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001805{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001808 Py_ssize_t startinpos;
1809 Py_ssize_t endinpos;
1810 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 const char *e;
1812 PyUnicodeObject *unicode;
1813 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001814 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 PyObject *errorHandler = NULL;
1816 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001817
1818 /* Note: size will always be longer than the resulting Unicode
1819 character count */
1820 unicode = _PyUnicode_New(size);
1821 if (!unicode)
1822 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001823 if (size == 0) {
1824 if (consumed)
1825 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828
1829 /* Unpack UTF-8 encoded data */
1830 p = unicode->str;
1831 e = s + size;
1832
1833 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001834 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835
1836 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001837 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 s++;
1839 continue;
1840 }
1841
1842 n = utf8_code_length[ch];
1843
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001845 if (consumed)
1846 break;
1847 else {
1848 errmsg = "unexpected end of data";
1849 startinpos = s-starts;
1850 endinpos = size;
1851 goto utf8Error;
1852 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
1855 switch (n) {
1856
1857 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001858 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001859 startinpos = s-starts;
1860 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001861 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
1863 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001864 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001865 startinpos = s-starts;
1866 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001867 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868
1869 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001870 if ((s[1] & 0xc0) != 0x80) {
1871 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 startinpos = s-starts;
1873 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001874 goto utf8Error;
1875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001877 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 startinpos = s-starts;
1879 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 errmsg = "illegal encoding";
1881 goto utf8Error;
1882 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001884 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 break;
1886
1887 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001888 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001889 (s[2] & 0xc0) != 0x80) {
1890 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 startinpos = s-starts;
1892 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 goto utf8Error;
1894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001896 if (ch < 0x0800) {
1897 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001898 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001899
1900 XXX For wide builds (UCS-4) we should probably try
1901 to recombine the surrogates into a single code
1902 unit.
1903 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001904 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001905 startinpos = s-starts;
1906 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 goto utf8Error;
1908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001910 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001911 break;
1912
1913 case 4:
1914 if ((s[1] & 0xc0) != 0x80 ||
1915 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 (s[3] & 0xc0) != 0x80) {
1917 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001918 startinpos = s-starts;
1919 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001920 goto utf8Error;
1921 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001922 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1923 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1924 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001925 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001926 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001927 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001928 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001929 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001930 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001931 startinpos = s-starts;
1932 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 goto utf8Error;
1934 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001935#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001936 *p++ = (Py_UNICODE)ch;
1937#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001938 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001939
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001940 /* translate from 10000..10FFFF to 0..FFFF */
1941 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001942
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001943 /* high surrogate = top 10 bits added to D800 */
1944 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001945
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001946 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001947 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001948#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 break;
1950
1951 default:
1952 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001953 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 }
1958 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001960
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001961 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 outpos = p-PyUnicode_AS_UNICODE(unicode);
1963 if (unicode_decode_call_errorhandler(
1964 errors, &errorHandler,
1965 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001966 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001967 (PyObject **)&unicode, &outpos, &p))
1968 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 }
Walter Dörwald69652032004-09-07 20:24:22 +00001970 if (consumed)
1971 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972
1973 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001974 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 goto onError;
1976
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001977 Py_XDECREF(errorHandler);
1978 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return (PyObject *)unicode;
1980
1981onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001982 Py_XDECREF(errorHandler);
1983 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 Py_DECREF(unicode);
1985 return NULL;
1986}
1987
Tim Peters602f7402002-04-27 18:03:26 +00001988/* Allocation strategy: if the string is short, convert into a stack buffer
1989 and allocate exactly as much space needed at the end. Else allocate the
1990 maximum possible needed (4 result bytes per Unicode character), and return
1991 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001992*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001993PyObject *
1994PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001995 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997{
Tim Peters602f7402002-04-27 18:03:26 +00001998#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001999
Martin v. Löwis18e16552006-02-15 17:27:45 +00002000 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002001 PyObject *v; /* result string object */
2002 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002003 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002004 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002005 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002006
Tim Peters602f7402002-04-27 18:03:26 +00002007 assert(s != NULL);
2008 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009
Tim Peters602f7402002-04-27 18:03:26 +00002010 if (size <= MAX_SHORT_UNICHARS) {
2011 /* Write into the stack buffer; nallocated can't overflow.
2012 * At the end, we'll allocate exactly as much heap space as it
2013 * turns out we need.
2014 */
2015 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2016 v = NULL; /* will allocate after we're done */
2017 p = stackbuf;
2018 }
2019 else {
2020 /* Overallocate on the heap, and give the excess back at the end. */
2021 nallocated = size * 4;
2022 if (nallocated / 4 != size) /* overflow! */
2023 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002024 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002025 if (v == NULL)
2026 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002027 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002028 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002029
Tim Peters602f7402002-04-27 18:03:26 +00002030 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002031 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002032
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002033 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002034 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002036
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002038 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002039 *p++ = (char)(0xc0 | (ch >> 6));
2040 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002041 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002042 else {
Tim Peters602f7402002-04-27 18:03:26 +00002043 /* Encode UCS2 Unicode ordinals */
2044 if (ch < 0x10000) {
2045 /* Special case: check for high surrogate */
2046 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2047 Py_UCS4 ch2 = s[i];
2048 /* Check for low surrogate and combine the two to
2049 form a UCS4 value */
2050 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002051 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002052 i++;
2053 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002054 }
Tim Peters602f7402002-04-27 18:03:26 +00002055 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002056 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002057 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002058 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2059 *p++ = (char)(0x80 | (ch & 0x3f));
2060 continue;
2061 }
2062encodeUCS4:
2063 /* Encode UCS4 Unicode ordinals */
2064 *p++ = (char)(0xf0 | (ch >> 18));
2065 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2066 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2067 *p++ = (char)(0x80 | (ch & 0x3f));
2068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002070
Tim Peters602f7402002-04-27 18:03:26 +00002071 if (v == NULL) {
2072 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002073 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002074 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002075 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002076 }
2077 else {
2078 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002079 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002080 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002081 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002084
Tim Peters602f7402002-04-27 18:03:26 +00002085#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086}
2087
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2089{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 if (!PyUnicode_Check(unicode)) {
2091 PyErr_BadArgument();
2092 return NULL;
2093 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002094 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2095 PyUnicode_GET_SIZE(unicode),
2096 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097}
2098
Walter Dörwald41980ca2007-08-16 21:55:45 +00002099/* --- UTF-32 Codec ------------------------------------------------------- */
2100
2101PyObject *
2102PyUnicode_DecodeUTF32(const char *s,
2103 Py_ssize_t size,
2104 const char *errors,
2105 int *byteorder)
2106{
2107 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2108}
2109
2110PyObject *
2111PyUnicode_DecodeUTF32Stateful(const char *s,
2112 Py_ssize_t size,
2113 const char *errors,
2114 int *byteorder,
2115 Py_ssize_t *consumed)
2116{
2117 const char *starts = s;
2118 Py_ssize_t startinpos;
2119 Py_ssize_t endinpos;
2120 Py_ssize_t outpos;
2121 PyUnicodeObject *unicode;
2122 Py_UNICODE *p;
2123#ifndef Py_UNICODE_WIDE
2124 int i, pairs;
2125#else
2126 const int pairs = 0;
2127#endif
2128 const unsigned char *q, *e;
2129 int bo = 0; /* assume native ordering by default */
2130 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002131 /* Offsets from q for retrieving bytes in the right order. */
2132#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2133 int iorder[] = {0, 1, 2, 3};
2134#else
2135 int iorder[] = {3, 2, 1, 0};
2136#endif
2137 PyObject *errorHandler = NULL;
2138 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002139 /* On narrow builds we split characters outside the BMP into two
2140 codepoints => count how much extra space we need. */
2141#ifndef Py_UNICODE_WIDE
2142 for (i = pairs = 0; i < size/4; i++)
2143 if (((Py_UCS4 *)s)[i] >= 0x10000)
2144 pairs++;
2145#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002146
2147 /* This might be one to much, because of a BOM */
2148 unicode = _PyUnicode_New((size+3)/4+pairs);
2149 if (!unicode)
2150 return NULL;
2151 if (size == 0)
2152 return (PyObject *)unicode;
2153
2154 /* Unpack UTF-32 encoded data */
2155 p = unicode->str;
2156 q = (unsigned char *)s;
2157 e = q + size;
2158
2159 if (byteorder)
2160 bo = *byteorder;
2161
2162 /* Check for BOM marks (U+FEFF) in the input and adjust current
2163 byte order setting accordingly. In native mode, the leading BOM
2164 mark is skipped, in all other modes, it is copied to the output
2165 stream as-is (giving a ZWNBSP character). */
2166 if (bo == 0) {
2167 if (size >= 4) {
2168 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2169 (q[iorder[1]] << 8) | q[iorder[0]];
2170#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2171 if (bom == 0x0000FEFF) {
2172 q += 4;
2173 bo = -1;
2174 }
2175 else if (bom == 0xFFFE0000) {
2176 q += 4;
2177 bo = 1;
2178 }
2179#else
2180 if (bom == 0x0000FEFF) {
2181 q += 4;
2182 bo = 1;
2183 }
2184 else if (bom == 0xFFFE0000) {
2185 q += 4;
2186 bo = -1;
2187 }
2188#endif
2189 }
2190 }
2191
2192 if (bo == -1) {
2193 /* force LE */
2194 iorder[0] = 0;
2195 iorder[1] = 1;
2196 iorder[2] = 2;
2197 iorder[3] = 3;
2198 }
2199 else if (bo == 1) {
2200 /* force BE */
2201 iorder[0] = 3;
2202 iorder[1] = 2;
2203 iorder[2] = 1;
2204 iorder[3] = 0;
2205 }
2206
2207 while (q < e) {
2208 Py_UCS4 ch;
2209 /* remaining bytes at the end? (size should be divisible by 4) */
2210 if (e-q<4) {
2211 if (consumed)
2212 break;
2213 errmsg = "truncated data";
2214 startinpos = ((const char *)q)-starts;
2215 endinpos = ((const char *)e)-starts;
2216 goto utf32Error;
2217 /* The remaining input chars are ignored if the callback
2218 chooses to skip the input */
2219 }
2220 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2221 (q[iorder[1]] << 8) | q[iorder[0]];
2222
2223 if (ch >= 0x110000)
2224 {
2225 errmsg = "codepoint not in range(0x110000)";
2226 startinpos = ((const char *)q)-starts;
2227 endinpos = startinpos+4;
2228 goto utf32Error;
2229 }
2230#ifndef Py_UNICODE_WIDE
2231 if (ch >= 0x10000)
2232 {
2233 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2234 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2235 }
2236 else
2237#endif
2238 *p++ = ch;
2239 q += 4;
2240 continue;
2241 utf32Error:
2242 outpos = p-PyUnicode_AS_UNICODE(unicode);
2243 if (unicode_decode_call_errorhandler(
2244 errors, &errorHandler,
2245 "utf32", errmsg,
2246 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2247 (PyObject **)&unicode, &outpos, &p))
2248 goto onError;
2249 }
2250
2251 if (byteorder)
2252 *byteorder = bo;
2253
2254 if (consumed)
2255 *consumed = (const char *)q-starts;
2256
2257 /* Adjust length */
2258 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2259 goto onError;
2260
2261 Py_XDECREF(errorHandler);
2262 Py_XDECREF(exc);
2263 return (PyObject *)unicode;
2264
2265onError:
2266 Py_DECREF(unicode);
2267 Py_XDECREF(errorHandler);
2268 Py_XDECREF(exc);
2269 return NULL;
2270}
2271
2272PyObject *
2273PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2274 Py_ssize_t size,
2275 const char *errors,
2276 int byteorder)
2277{
2278 PyObject *v;
2279 unsigned char *p;
2280#ifndef Py_UNICODE_WIDE
2281 int i, pairs;
2282#else
2283 const int pairs = 0;
2284#endif
2285 /* Offsets from p for storing byte pairs in the right order. */
2286#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2287 int iorder[] = {0, 1, 2, 3};
2288#else
2289 int iorder[] = {3, 2, 1, 0};
2290#endif
2291
2292#define STORECHAR(CH) \
2293 do { \
2294 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2295 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2296 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2297 p[iorder[0]] = (CH) & 0xff; \
2298 p += 4; \
2299 } while(0)
2300
2301 /* In narrow builds we can output surrogate pairs as one codepoint,
2302 so we need less space. */
2303#ifndef Py_UNICODE_WIDE
2304 for (i = pairs = 0; i < size-1; i++)
2305 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2306 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2307 pairs++;
2308#endif
2309 v = PyBytes_FromStringAndSize(NULL,
2310 4 * (size - pairs + (byteorder == 0)));
2311 if (v == NULL)
2312 return NULL;
2313
2314 p = (unsigned char *)PyBytes_AS_STRING(v);
2315 if (byteorder == 0)
2316 STORECHAR(0xFEFF);
2317 if (size == 0)
2318 return v;
2319
2320 if (byteorder == -1) {
2321 /* force LE */
2322 iorder[0] = 0;
2323 iorder[1] = 1;
2324 iorder[2] = 2;
2325 iorder[3] = 3;
2326 }
2327 else if (byteorder == 1) {
2328 /* force BE */
2329 iorder[0] = 3;
2330 iorder[1] = 2;
2331 iorder[2] = 1;
2332 iorder[3] = 0;
2333 }
2334
2335 while (size-- > 0) {
2336 Py_UCS4 ch = *s++;
2337#ifndef Py_UNICODE_WIDE
2338 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2339 Py_UCS4 ch2 = *s;
2340 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2341 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2342 s++;
2343 size--;
2344 }
2345 }
2346#endif
2347 STORECHAR(ch);
2348 }
2349 return v;
2350#undef STORECHAR
2351}
2352
2353PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2354{
2355 if (!PyUnicode_Check(unicode)) {
2356 PyErr_BadArgument();
2357 return NULL;
2358 }
2359 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2360 PyUnicode_GET_SIZE(unicode),
2361 NULL,
2362 0);
2363}
2364
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365/* --- UTF-16 Codec ------------------------------------------------------- */
2366
Tim Peters772747b2001-08-09 22:21:55 +00002367PyObject *
2368PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002369 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002370 const char *errors,
2371 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372{
Walter Dörwald69652032004-09-07 20:24:22 +00002373 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2374}
2375
2376PyObject *
2377PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002378 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002379 const char *errors,
2380 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002381 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002382{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002383 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002384 Py_ssize_t startinpos;
2385 Py_ssize_t endinpos;
2386 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 PyUnicodeObject *unicode;
2388 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002389 const unsigned char *q, *e;
2390 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002391 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002392 /* Offsets from q for retrieving byte pairs in the right order. */
2393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2394 int ihi = 1, ilo = 0;
2395#else
2396 int ihi = 0, ilo = 1;
2397#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 PyObject *errorHandler = NULL;
2399 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400
2401 /* Note: size will always be longer than the resulting Unicode
2402 character count */
2403 unicode = _PyUnicode_New(size);
2404 if (!unicode)
2405 return NULL;
2406 if (size == 0)
2407 return (PyObject *)unicode;
2408
2409 /* Unpack UTF-16 encoded data */
2410 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002411 q = (unsigned char *)s;
2412 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413
2414 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002415 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002417 /* Check for BOM marks (U+FEFF) in the input and adjust current
2418 byte order setting accordingly. In native mode, the leading BOM
2419 mark is skipped, in all other modes, it is copied to the output
2420 stream as-is (giving a ZWNBSP character). */
2421 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002422 if (size >= 2) {
2423 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002424#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002425 if (bom == 0xFEFF) {
2426 q += 2;
2427 bo = -1;
2428 }
2429 else if (bom == 0xFFFE) {
2430 q += 2;
2431 bo = 1;
2432 }
Tim Petersced69f82003-09-16 20:30:58 +00002433#else
Walter Dörwald69652032004-09-07 20:24:22 +00002434 if (bom == 0xFEFF) {
2435 q += 2;
2436 bo = 1;
2437 }
2438 else if (bom == 0xFFFE) {
2439 q += 2;
2440 bo = -1;
2441 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002442#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002443 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445
Tim Peters772747b2001-08-09 22:21:55 +00002446 if (bo == -1) {
2447 /* force LE */
2448 ihi = 1;
2449 ilo = 0;
2450 }
2451 else if (bo == 1) {
2452 /* force BE */
2453 ihi = 0;
2454 ilo = 1;
2455 }
2456
2457 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002458 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002459 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002460 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002461 if (consumed)
2462 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 errmsg = "truncated data";
2464 startinpos = ((const char *)q)-starts;
2465 endinpos = ((const char *)e)-starts;
2466 goto utf16Error;
2467 /* The remaining input chars are ignored if the callback
2468 chooses to skip the input */
2469 }
2470 ch = (q[ihi] << 8) | q[ilo];
2471
Tim Peters772747b2001-08-09 22:21:55 +00002472 q += 2;
2473
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 if (ch < 0xD800 || ch > 0xDFFF) {
2475 *p++ = ch;
2476 continue;
2477 }
2478
2479 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002480 if (q >= e) {
2481 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 startinpos = (((const char *)q)-2)-starts;
2483 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002484 goto utf16Error;
2485 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002486 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002487 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2488 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002489 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002490#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002491 *p++ = ch;
2492 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002493#else
2494 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002495#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002496 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002497 }
2498 else {
2499 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 startinpos = (((const char *)q)-4)-starts;
2501 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002502 goto utf16Error;
2503 }
2504
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002506 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 startinpos = (((const char *)q)-2)-starts;
2508 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002509 /* Fall through to report the error */
2510
2511 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002512 outpos = p-PyUnicode_AS_UNICODE(unicode);
2513 if (unicode_decode_call_errorhandler(
2514 errors, &errorHandler,
2515 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002516 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002517 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 }
2520
2521 if (byteorder)
2522 *byteorder = bo;
2523
Walter Dörwald69652032004-09-07 20:24:22 +00002524 if (consumed)
2525 *consumed = (const char *)q-starts;
2526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002528 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 goto onError;
2530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 Py_XDECREF(errorHandler);
2532 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 return (PyObject *)unicode;
2534
2535onError:
2536 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 Py_XDECREF(errorHandler);
2538 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 return NULL;
2540}
2541
Tim Peters772747b2001-08-09 22:21:55 +00002542PyObject *
2543PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002544 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002545 const char *errors,
2546 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547{
2548 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002549 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002550#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002551 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002552#else
2553 const int pairs = 0;
2554#endif
Tim Peters772747b2001-08-09 22:21:55 +00002555 /* Offsets from p for storing byte pairs in the right order. */
2556#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2557 int ihi = 1, ilo = 0;
2558#else
2559 int ihi = 0, ilo = 1;
2560#endif
2561
2562#define STORECHAR(CH) \
2563 do { \
2564 p[ihi] = ((CH) >> 8) & 0xff; \
2565 p[ilo] = (CH) & 0xff; \
2566 p += 2; \
2567 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002569#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002570 for (i = pairs = 0; i < size; i++)
2571 if (s[i] >= 0x10000)
2572 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002573#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002574 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002575 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 if (v == NULL)
2577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578
Walter Dörwald3cc34522007-05-04 10:48:27 +00002579 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002581 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002582 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002583 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002584
2585 if (byteorder == -1) {
2586 /* force LE */
2587 ihi = 1;
2588 ilo = 0;
2589 }
2590 else if (byteorder == 1) {
2591 /* force BE */
2592 ihi = 0;
2593 ilo = 1;
2594 }
2595
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002596 while (size-- > 0) {
2597 Py_UNICODE ch = *s++;
2598 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002599#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002600 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002601 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2602 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002604#endif
Tim Peters772747b2001-08-09 22:21:55 +00002605 STORECHAR(ch);
2606 if (ch2)
2607 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002610#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611}
2612
2613PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2614{
2615 if (!PyUnicode_Check(unicode)) {
2616 PyErr_BadArgument();
2617 return NULL;
2618 }
2619 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2620 PyUnicode_GET_SIZE(unicode),
2621 NULL,
2622 0);
2623}
2624
2625/* --- Unicode Escape Codec ----------------------------------------------- */
2626
Fredrik Lundh06d12682001-01-24 07:59:11 +00002627static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002628
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002630 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002631 const char *errors)
2632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002634 Py_ssize_t startinpos;
2635 Py_ssize_t endinpos;
2636 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002639 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002641 char* message;
2642 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002643 PyObject *errorHandler = NULL;
2644 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002645
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 /* Escaped strings will always be longer than the resulting
2647 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002648 length after conversion to the true value.
2649 (but if the error callback returns a long replacement string
2650 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 v = _PyUnicode_New(size);
2652 if (v == NULL)
2653 goto onError;
2654 if (size == 0)
2655 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002656
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002659
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 while (s < end) {
2661 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002662 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664
2665 /* Non-escape characters are interpreted as Unicode ordinals */
2666 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002667 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 continue;
2669 }
2670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002671 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 /* \ - Escapes */
2673 s++;
2674 switch (*s++) {
2675
2676 /* \x escapes */
2677 case '\n': break;
2678 case '\\': *p++ = '\\'; break;
2679 case '\'': *p++ = '\''; break;
2680 case '\"': *p++ = '\"'; break;
2681 case 'b': *p++ = '\b'; break;
2682 case 'f': *p++ = '\014'; break; /* FF */
2683 case 't': *p++ = '\t'; break;
2684 case 'n': *p++ = '\n'; break;
2685 case 'r': *p++ = '\r'; break;
2686 case 'v': *p++ = '\013'; break; /* VT */
2687 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2688
2689 /* \OOO (octal) escapes */
2690 case '0': case '1': case '2': case '3':
2691 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002692 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002694 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002696 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002698 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 break;
2700
Fredrik Lundhccc74732001-02-18 22:13:49 +00002701 /* hex escapes */
2702 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002704 digits = 2;
2705 message = "truncated \\xXX escape";
2706 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707
Fredrik Lundhccc74732001-02-18 22:13:49 +00002708 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002710 digits = 4;
2711 message = "truncated \\uXXXX escape";
2712 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713
Fredrik Lundhccc74732001-02-18 22:13:49 +00002714 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002715 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002716 digits = 8;
2717 message = "truncated \\UXXXXXXXX escape";
2718 hexescape:
2719 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 outpos = p-PyUnicode_AS_UNICODE(v);
2721 if (s+digits>end) {
2722 endinpos = size;
2723 if (unicode_decode_call_errorhandler(
2724 errors, &errorHandler,
2725 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002726 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 (PyObject **)&v, &outpos, &p))
2728 goto onError;
2729 goto nextByte;
2730 }
2731 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002732 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002733 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 endinpos = (s+i+1)-starts;
2735 if (unicode_decode_call_errorhandler(
2736 errors, &errorHandler,
2737 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002738 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002740 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002742 }
2743 chr = (chr<<4) & ~0xF;
2744 if (c >= '0' && c <= '9')
2745 chr += c - '0';
2746 else if (c >= 'a' && c <= 'f')
2747 chr += 10 + c - 'a';
2748 else
2749 chr += 10 + c - 'A';
2750 }
2751 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002752 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 /* _decoding_error will have already written into the
2754 target buffer. */
2755 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002756 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002757 /* when we get here, chr is a 32-bit unicode character */
2758 if (chr <= 0xffff)
2759 /* UCS-2 character */
2760 *p++ = (Py_UNICODE) chr;
2761 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002762 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002763 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002764#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002765 *p++ = chr;
2766#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002767 chr -= 0x10000L;
2768 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002769 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002770#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002771 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 endinpos = s-starts;
2773 outpos = p-PyUnicode_AS_UNICODE(v);
2774 if (unicode_decode_call_errorhandler(
2775 errors, &errorHandler,
2776 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002777 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002778 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002779 goto onError;
2780 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002781 break;
2782
2783 /* \N{name} */
2784 case 'N':
2785 message = "malformed \\N character escape";
2786 if (ucnhash_CAPI == NULL) {
2787 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002788 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002789 m = PyImport_ImportModule("unicodedata");
2790 if (m == NULL)
2791 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002792 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002794 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002796 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002797 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002798 if (ucnhash_CAPI == NULL)
2799 goto ucnhashError;
2800 }
2801 if (*s == '{') {
2802 const char *start = s+1;
2803 /* look for the closing brace */
2804 while (*s != '}' && s < end)
2805 s++;
2806 if (s > start && s < end && *s == '}') {
2807 /* found a name. look it up in the unicode database */
2808 message = "unknown Unicode character name";
2809 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002810 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002811 goto store;
2812 }
2813 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002814 endinpos = s-starts;
2815 outpos = p-PyUnicode_AS_UNICODE(v);
2816 if (unicode_decode_call_errorhandler(
2817 errors, &errorHandler,
2818 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002819 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002821 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002822 break;
2823
2824 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002825 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 message = "\\ at end of string";
2827 s--;
2828 endinpos = s-starts;
2829 outpos = p-PyUnicode_AS_UNICODE(v);
2830 if (unicode_decode_call_errorhandler(
2831 errors, &errorHandler,
2832 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002833 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002835 goto onError;
2836 }
2837 else {
2838 *p++ = '\\';
2839 *p++ = (unsigned char)s[-1];
2840 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002841 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 nextByte:
2844 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002846 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002848 Py_XDECREF(errorHandler);
2849 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002851
Fredrik Lundhccc74732001-02-18 22:13:49 +00002852ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002853 PyErr_SetString(
2854 PyExc_UnicodeError,
2855 "\\N escapes not supported (can't load unicodedata module)"
2856 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002857 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002858 Py_XDECREF(errorHandler);
2859 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002860 return NULL;
2861
Fredrik Lundhccc74732001-02-18 22:13:49 +00002862onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864 Py_XDECREF(errorHandler);
2865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 return NULL;
2867}
2868
2869/* Return a Unicode-Escape string version of the Unicode object.
2870
2871 If quotes is true, the string is enclosed in u"" or u'' quotes as
2872 appropriate.
2873
2874*/
2875
Thomas Wouters477c8d52006-05-27 19:21:47 +00002876Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2877 Py_ssize_t size,
2878 Py_UNICODE ch)
2879{
2880 /* like wcschr, but doesn't stop at NULL characters */
2881
2882 while (size-- > 0) {
2883 if (*s == ch)
2884 return s;
2885 s++;
2886 }
2887
2888 return NULL;
2889}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002890
Walter Dörwald79e913e2007-05-12 11:08:06 +00002891static const char *hexdigits = "0123456789abcdef";
2892
2893PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2894 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895{
2896 PyObject *repr;
2897 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898
Thomas Wouters89f507f2006-12-13 04:49:30 +00002899 /* XXX(nnorwitz): rather than over-allocating, it would be
2900 better to choose a different scheme. Perhaps scan the
2901 first N-chars of the string and allocate based on that size.
2902 */
2903 /* Initial allocation is based on the longest-possible unichr
2904 escape.
2905
2906 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2907 unichr, so in this case it's the longest unichr escape. In
2908 narrow (UTF-16) builds this is five chars per source unichr
2909 since there are two unichrs in the surrogate pair, so in narrow
2910 (UTF-16) builds it's not the longest unichr escape.
2911
2912 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2913 so in the narrow (UTF-16) build case it's the longest unichr
2914 escape.
2915 */
2916
Walter Dörwald79e913e2007-05-12 11:08:06 +00002917 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002918#ifdef Py_UNICODE_WIDE
2919 + 10*size
2920#else
2921 + 6*size
2922#endif
2923 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 if (repr == NULL)
2925 return NULL;
2926
Walter Dörwald79e913e2007-05-12 11:08:06 +00002927 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929 while (size-- > 0) {
2930 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002931
Walter Dörwald79e913e2007-05-12 11:08:06 +00002932 /* Escape backslashes */
2933 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 *p++ = '\\';
2935 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002936 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002937 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002938
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002939#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002940 /* Map 21-bit characters to '\U00xxxxxx' */
2941 else if (ch >= 0x10000) {
2942 *p++ = '\\';
2943 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002944 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2945 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2946 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2947 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2948 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2949 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2950 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2951 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002952 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002953 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002954#else
2955 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002956 else if (ch >= 0xD800 && ch < 0xDC00) {
2957 Py_UNICODE ch2;
2958 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002959
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002960 ch2 = *s++;
2961 size--;
2962 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2963 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2964 *p++ = '\\';
2965 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002966 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2967 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2968 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2969 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2970 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2971 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2972 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2973 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002974 continue;
2975 }
2976 /* Fall through: isolated surrogates are copied as-is */
2977 s--;
2978 size++;
2979 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002980#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002981
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002983 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 *p++ = '\\';
2985 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002986 *p++ = hexdigits[(ch >> 12) & 0x000F];
2987 *p++ = hexdigits[(ch >> 8) & 0x000F];
2988 *p++ = hexdigits[(ch >> 4) & 0x000F];
2989 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002991
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002992 /* Map special whitespace to '\t', \n', '\r' */
2993 else if (ch == '\t') {
2994 *p++ = '\\';
2995 *p++ = 't';
2996 }
2997 else if (ch == '\n') {
2998 *p++ = '\\';
2999 *p++ = 'n';
3000 }
3001 else if (ch == '\r') {
3002 *p++ = '\\';
3003 *p++ = 'r';
3004 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003005
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003006 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003007 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003009 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003010 *p++ = hexdigits[(ch >> 4) & 0x000F];
3011 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003012 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003013
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 /* Copy everything else as-is */
3015 else
3016 *p++ = (char) ch;
3017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018
3019 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003020 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
3021 Py_DECREF(repr);
3022 return NULL;
3023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 return repr;
3025}
3026
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3028{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003029 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 if (!PyUnicode_Check(unicode)) {
3031 PyErr_BadArgument();
3032 return NULL;
3033 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003034 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3035 PyUnicode_GET_SIZE(unicode));
3036
3037 if (!s)
3038 return NULL;
3039 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3040 PyBytes_GET_SIZE(s));
3041 Py_DECREF(s);
3042 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043}
3044
3045/* --- Raw Unicode Escape Codec ------------------------------------------- */
3046
3047PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003048 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 const char *errors)
3050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003052 Py_ssize_t startinpos;
3053 Py_ssize_t endinpos;
3054 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003056 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 const char *end;
3058 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 PyObject *errorHandler = NULL;
3060 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003061
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 /* Escaped strings will always be longer than the resulting
3063 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 length after conversion to the true value. (But decoding error
3065 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 v = _PyUnicode_New(size);
3067 if (v == NULL)
3068 goto onError;
3069 if (size == 0)
3070 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 end = s + size;
3073 while (s < end) {
3074 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003075 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003077 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078
3079 /* Non-escape characters are interpreted as Unicode ordinals */
3080 if (*s != '\\') {
3081 *p++ = (unsigned char)*s++;
3082 continue;
3083 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003084 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085
3086 /* \u-escapes are only interpreted iff the number of leading
3087 backslashes if odd */
3088 bs = s;
3089 for (;s < end;) {
3090 if (*s != '\\')
3091 break;
3092 *p++ = (unsigned char)*s++;
3093 }
3094 if (((s - bs) & 1) == 0 ||
3095 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003096 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 continue;
3098 }
3099 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003100 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 s++;
3102
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003103 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003105 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003107 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 endinpos = s-starts;
3109 if (unicode_decode_call_errorhandler(
3110 errors, &errorHandler,
3111 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003115 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 }
3117 x = (x<<4) & ~0xF;
3118 if (c >= '0' && c <= '9')
3119 x += c - '0';
3120 else if (c >= 'a' && c <= 'f')
3121 x += 10 + c - 'a';
3122 else
3123 x += 10 + c - 'A';
3124 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003125#ifndef Py_UNICODE_WIDE
3126 if (x > 0x10000) {
3127 if (unicode_decode_call_errorhandler(
3128 errors, &errorHandler,
3129 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003130 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003131 (PyObject **)&v, &outpos, &p))
3132 goto onError;
3133 }
3134#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 *p++ = x;
3136 nextByte:
3137 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003139 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003140 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003141 Py_XDECREF(errorHandler);
3142 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003144
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 onError:
3146 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 Py_XDECREF(errorHandler);
3148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 return NULL;
3150}
3151
3152PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003153 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154{
3155 PyObject *repr;
3156 char *p;
3157 char *q;
3158
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003159#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003160 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003161#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003162 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003163#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164 if (repr == NULL)
3165 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003166 if (size == 0)
3167 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168
Walter Dörwald711005d2007-05-12 12:03:26 +00003169 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 while (size-- > 0) {
3171 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003172#ifdef Py_UNICODE_WIDE
3173 /* Map 32-bit characters to '\Uxxxxxxxx' */
3174 if (ch >= 0x10000) {
3175 *p++ = '\\';
3176 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003177 *p++ = hexdigits[(ch >> 28) & 0xf];
3178 *p++ = hexdigits[(ch >> 24) & 0xf];
3179 *p++ = hexdigits[(ch >> 20) & 0xf];
3180 *p++ = hexdigits[(ch >> 16) & 0xf];
3181 *p++ = hexdigits[(ch >> 12) & 0xf];
3182 *p++ = hexdigits[(ch >> 8) & 0xf];
3183 *p++ = hexdigits[(ch >> 4) & 0xf];
3184 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003185 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003186 else
3187#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 /* Map 16-bit characters to '\uxxxx' */
3189 if (ch >= 256) {
3190 *p++ = '\\';
3191 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003192 *p++ = hexdigits[(ch >> 12) & 0xf];
3193 *p++ = hexdigits[(ch >> 8) & 0xf];
3194 *p++ = hexdigits[(ch >> 4) & 0xf];
3195 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 }
3197 /* Copy everything else as-is */
3198 else
3199 *p++ = (char) ch;
3200 }
3201 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003202 if (PyBytes_Resize(repr, p - q)) {
3203 Py_DECREF(repr);
3204 return NULL;
3205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 return repr;
3207}
3208
3209PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3210{
Walter Dörwald711005d2007-05-12 12:03:26 +00003211 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003213 PyErr_BadArgument();
3214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003216 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3217 PyUnicode_GET_SIZE(unicode));
3218
3219 if (!s)
3220 return NULL;
3221 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3222 PyBytes_GET_SIZE(s));
3223 Py_DECREF(s);
3224 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225}
3226
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003227/* --- Unicode Internal Codec ------------------------------------------- */
3228
3229PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003230 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003231 const char *errors)
3232{
3233 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003234 Py_ssize_t startinpos;
3235 Py_ssize_t endinpos;
3236 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003237 PyUnicodeObject *v;
3238 Py_UNICODE *p;
3239 const char *end;
3240 const char *reason;
3241 PyObject *errorHandler = NULL;
3242 PyObject *exc = NULL;
3243
Neal Norwitzd43069c2006-01-08 01:12:10 +00003244#ifdef Py_UNICODE_WIDE
3245 Py_UNICODE unimax = PyUnicode_GetMax();
3246#endif
3247
Thomas Wouters89f507f2006-12-13 04:49:30 +00003248 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003249 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3250 if (v == NULL)
3251 goto onError;
3252 if (PyUnicode_GetSize((PyObject *)v) == 0)
3253 return (PyObject *)v;
3254 p = PyUnicode_AS_UNICODE(v);
3255 end = s + size;
3256
3257 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003258 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003259 /* We have to sanity check the raw data, otherwise doom looms for
3260 some malformed UCS-4 data. */
3261 if (
3262 #ifdef Py_UNICODE_WIDE
3263 *p > unimax || *p < 0 ||
3264 #endif
3265 end-s < Py_UNICODE_SIZE
3266 )
3267 {
3268 startinpos = s - starts;
3269 if (end-s < Py_UNICODE_SIZE) {
3270 endinpos = end-starts;
3271 reason = "truncated input";
3272 }
3273 else {
3274 endinpos = s - starts + Py_UNICODE_SIZE;
3275 reason = "illegal code point (> 0x10FFFF)";
3276 }
3277 outpos = p - PyUnicode_AS_UNICODE(v);
3278 if (unicode_decode_call_errorhandler(
3279 errors, &errorHandler,
3280 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003281 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003282 (PyObject **)&v, &outpos, &p)) {
3283 goto onError;
3284 }
3285 }
3286 else {
3287 p++;
3288 s += Py_UNICODE_SIZE;
3289 }
3290 }
3291
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003292 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003293 goto onError;
3294 Py_XDECREF(errorHandler);
3295 Py_XDECREF(exc);
3296 return (PyObject *)v;
3297
3298 onError:
3299 Py_XDECREF(v);
3300 Py_XDECREF(errorHandler);
3301 Py_XDECREF(exc);
3302 return NULL;
3303}
3304
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305/* --- Latin-1 Codec ------------------------------------------------------ */
3306
3307PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003308 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 const char *errors)
3310{
3311 PyUnicodeObject *v;
3312 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003313
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003315 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003316 Py_UNICODE r = *(unsigned char*)s;
3317 return PyUnicode_FromUnicode(&r, 1);
3318 }
3319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 v = _PyUnicode_New(size);
3321 if (v == NULL)
3322 goto onError;
3323 if (size == 0)
3324 return (PyObject *)v;
3325 p = PyUnicode_AS_UNICODE(v);
3326 while (size-- > 0)
3327 *p++ = (unsigned char)*s++;
3328 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003329
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 onError:
3331 Py_XDECREF(v);
3332 return NULL;
3333}
3334
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335/* create or adjust a UnicodeEncodeError */
3336static void make_encode_exception(PyObject **exceptionObject,
3337 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003338 const Py_UNICODE *unicode, Py_ssize_t size,
3339 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 if (*exceptionObject == NULL) {
3343 *exceptionObject = PyUnicodeEncodeError_Create(
3344 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345 }
3346 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3348 goto onError;
3349 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3350 goto onError;
3351 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3352 goto onError;
3353 return;
3354 onError:
3355 Py_DECREF(*exceptionObject);
3356 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 }
3358}
3359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360/* raises a UnicodeEncodeError */
3361static void raise_encode_exception(PyObject **exceptionObject,
3362 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003363 const Py_UNICODE *unicode, Py_ssize_t size,
3364 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 const char *reason)
3366{
3367 make_encode_exception(exceptionObject,
3368 encoding, unicode, size, startpos, endpos, reason);
3369 if (*exceptionObject != NULL)
3370 PyCodec_StrictErrors(*exceptionObject);
3371}
3372
3373/* error handling callback helper:
3374 build arguments, call the callback and check the arguments,
3375 put the result into newpos and return the replacement string, which
3376 has to be freed by the caller */
3377static PyObject *unicode_encode_call_errorhandler(const char *errors,
3378 PyObject **errorHandler,
3379 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003380 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3381 Py_ssize_t startpos, Py_ssize_t endpos,
3382 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003384 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385
3386 PyObject *restuple;
3387 PyObject *resunicode;
3388
3389 if (*errorHandler == NULL) {
3390 *errorHandler = PyCodec_LookupError(errors);
3391 if (*errorHandler == NULL)
3392 return NULL;
3393 }
3394
3395 make_encode_exception(exceptionObject,
3396 encoding, unicode, size, startpos, endpos, reason);
3397 if (*exceptionObject == NULL)
3398 return NULL;
3399
3400 restuple = PyObject_CallFunctionObjArgs(
3401 *errorHandler, *exceptionObject, NULL);
3402 if (restuple == NULL)
3403 return NULL;
3404 if (!PyTuple_Check(restuple)) {
3405 PyErr_Format(PyExc_TypeError, &argparse[4]);
3406 Py_DECREF(restuple);
3407 return NULL;
3408 }
3409 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3410 &resunicode, newpos)) {
3411 Py_DECREF(restuple);
3412 return NULL;
3413 }
3414 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003415 *newpos = size+*newpos;
3416 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003417 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003418 Py_DECREF(restuple);
3419 return NULL;
3420 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 Py_INCREF(resunicode);
3422 Py_DECREF(restuple);
3423 return resunicode;
3424}
3425
3426static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003427 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 const char *errors,
3429 int limit)
3430{
3431 /* output object */
3432 PyObject *res;
3433 /* pointers to the beginning and end+1 of input */
3434 const Py_UNICODE *startp = p;
3435 const Py_UNICODE *endp = p + size;
3436 /* pointer to the beginning of the unencodable characters */
3437 /* const Py_UNICODE *badp = NULL; */
3438 /* pointer into the output */
3439 char *str;
3440 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003441 Py_ssize_t respos = 0;
3442 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003443 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3444 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 PyObject *errorHandler = NULL;
3446 PyObject *exc = NULL;
3447 /* the following variable is used for caching string comparisons
3448 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3449 int known_errorHandler = -1;
3450
3451 /* allocate enough for a simple encoding without
3452 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003453 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 if (res == NULL)
3455 goto onError;
3456 if (size == 0)
3457 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003458 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 ressize = size;
3460
3461 while (p<endp) {
3462 Py_UNICODE c = *p;
3463
3464 /* can we encode this? */
3465 if (c<limit) {
3466 /* no overflow check, because we know that the space is enough */
3467 *str++ = (char)c;
3468 ++p;
3469 }
3470 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003471 Py_ssize_t unicodepos = p-startp;
3472 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003474 Py_ssize_t repsize;
3475 Py_ssize_t newpos;
3476 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 Py_UNICODE *uni2;
3478 /* startpos for collecting unencodable chars */
3479 const Py_UNICODE *collstart = p;
3480 const Py_UNICODE *collend = p;
3481 /* find all unecodable characters */
3482 while ((collend < endp) && ((*collend)>=limit))
3483 ++collend;
3484 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3485 if (known_errorHandler==-1) {
3486 if ((errors==NULL) || (!strcmp(errors, "strict")))
3487 known_errorHandler = 1;
3488 else if (!strcmp(errors, "replace"))
3489 known_errorHandler = 2;
3490 else if (!strcmp(errors, "ignore"))
3491 known_errorHandler = 3;
3492 else if (!strcmp(errors, "xmlcharrefreplace"))
3493 known_errorHandler = 4;
3494 else
3495 known_errorHandler = 0;
3496 }
3497 switch (known_errorHandler) {
3498 case 1: /* strict */
3499 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3500 goto onError;
3501 case 2: /* replace */
3502 while (collstart++<collend)
3503 *str++ = '?'; /* fall through */
3504 case 3: /* ignore */
3505 p = collend;
3506 break;
3507 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003508 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 /* determine replacement size (temporarily (mis)uses p) */
3510 for (p = collstart, repsize = 0; p < collend; ++p) {
3511 if (*p<10)
3512 repsize += 2+1+1;
3513 else if (*p<100)
3514 repsize += 2+2+1;
3515 else if (*p<1000)
3516 repsize += 2+3+1;
3517 else if (*p<10000)
3518 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003519#ifndef Py_UNICODE_WIDE
3520 else
3521 repsize += 2+5+1;
3522#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 else if (*p<100000)
3524 repsize += 2+5+1;
3525 else if (*p<1000000)
3526 repsize += 2+6+1;
3527 else
3528 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003529#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 }
3531 requiredsize = respos+repsize+(endp-collend);
3532 if (requiredsize > ressize) {
3533 if (requiredsize<2*ressize)
3534 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003535 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003537 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 ressize = requiredsize;
3539 }
3540 /* generate replacement (temporarily (mis)uses p) */
3541 for (p = collstart; p < collend; ++p) {
3542 str += sprintf(str, "&#%d;", (int)*p);
3543 }
3544 p = collend;
3545 break;
3546 default:
3547 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3548 encoding, reason, startp, size, &exc,
3549 collstart-startp, collend-startp, &newpos);
3550 if (repunicode == NULL)
3551 goto onError;
3552 /* need more space? (at least enough for what we
3553 have+the replacement+the rest of the string, so
3554 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003555 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 repsize = PyUnicode_GET_SIZE(repunicode);
3557 requiredsize = respos+repsize+(endp-collend);
3558 if (requiredsize > ressize) {
3559 if (requiredsize<2*ressize)
3560 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003561 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 Py_DECREF(repunicode);
3563 goto onError;
3564 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003565 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 ressize = requiredsize;
3567 }
3568 /* check if there is anything unencodable in the replacement
3569 and copy it to the output */
3570 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3571 c = *uni2;
3572 if (c >= limit) {
3573 raise_encode_exception(&exc, encoding, startp, size,
3574 unicodepos, unicodepos+1, reason);
3575 Py_DECREF(repunicode);
3576 goto onError;
3577 }
3578 *str = (char)c;
3579 }
3580 p = startp + newpos;
3581 Py_DECREF(repunicode);
3582 }
3583 }
3584 }
3585 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003586 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 if (respos<ressize)
3588 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003589 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 Py_XDECREF(errorHandler);
3591 Py_XDECREF(exc);
3592 return res;
3593
3594 onError:
3595 Py_XDECREF(res);
3596 Py_XDECREF(errorHandler);
3597 Py_XDECREF(exc);
3598 return NULL;
3599}
3600
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003602 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 const char *errors)
3604{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606}
3607
3608PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3609{
3610 if (!PyUnicode_Check(unicode)) {
3611 PyErr_BadArgument();
3612 return NULL;
3613 }
3614 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3615 PyUnicode_GET_SIZE(unicode),
3616 NULL);
3617}
3618
3619/* --- 7-bit ASCII Codec -------------------------------------------------- */
3620
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003622 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 const char *errors)
3624{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 PyUnicodeObject *v;
3627 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003628 Py_ssize_t startinpos;
3629 Py_ssize_t endinpos;
3630 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 const char *e;
3632 PyObject *errorHandler = NULL;
3633 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003634
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003636 if (size == 1 && *(unsigned char*)s < 128) {
3637 Py_UNICODE r = *(unsigned char*)s;
3638 return PyUnicode_FromUnicode(&r, 1);
3639 }
Tim Petersced69f82003-09-16 20:30:58 +00003640
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 v = _PyUnicode_New(size);
3642 if (v == NULL)
3643 goto onError;
3644 if (size == 0)
3645 return (PyObject *)v;
3646 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 e = s + size;
3648 while (s < e) {
3649 register unsigned char c = (unsigned char)*s;
3650 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 ++s;
3653 }
3654 else {
3655 startinpos = s-starts;
3656 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003657 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 if (unicode_decode_call_errorhandler(
3659 errors, &errorHandler,
3660 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003661 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003666 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003667 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003668 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 Py_XDECREF(errorHandler);
3670 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003672
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 onError:
3674 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_XDECREF(errorHandler);
3676 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 return NULL;
3678}
3679
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003681 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 const char *errors)
3683{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685}
3686
3687PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3688{
3689 if (!PyUnicode_Check(unicode)) {
3690 PyErr_BadArgument();
3691 return NULL;
3692 }
3693 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3694 PyUnicode_GET_SIZE(unicode),
3695 NULL);
3696}
3697
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003698#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003699
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003700/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003701
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003702#if SIZEOF_INT < SIZEOF_SSIZE_T
3703#define NEED_RETRY
3704#endif
3705
3706/* XXX This code is limited to "true" double-byte encodings, as
3707 a) it assumes an incomplete character consists of a single byte, and
3708 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3709 encodings, see IsDBCSLeadByteEx documentation. */
3710
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3721
3722/*
3723 * Decode MBCS string into unicode object. If 'final' is set, converts
3724 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3725 */
3726static int decode_mbcs(PyUnicodeObject **v,
3727 const char *s, /* MBCS string */
3728 int size, /* sizeof MBCS string */
3729 int final)
3730{
3731 Py_UNICODE *p;
3732 Py_ssize_t n = 0;
3733 int usize = 0;
3734
3735 assert(size >= 0);
3736
3737 /* Skip trailing lead-byte unless 'final' is set */
3738 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3739 --size;
3740
3741 /* First get the size of the result */
3742 if (size > 0) {
3743 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3744 if (usize == 0) {
3745 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3746 return -1;
3747 }
3748 }
3749
3750 if (*v == NULL) {
3751 /* Create unicode object */
3752 *v = _PyUnicode_New(usize);
3753 if (*v == NULL)
3754 return -1;
3755 }
3756 else {
3757 /* Extend unicode object */
3758 n = PyUnicode_GET_SIZE(*v);
3759 if (_PyUnicode_Resize(v, n + usize) < 0)
3760 return -1;
3761 }
3762
3763 /* Do the conversion */
3764 if (size > 0) {
3765 p = PyUnicode_AS_UNICODE(*v) + n;
3766 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3767 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3768 return -1;
3769 }
3770 }
3771
3772 return size;
3773}
3774
3775PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3776 Py_ssize_t size,
3777 const char *errors,
3778 Py_ssize_t *consumed)
3779{
3780 PyUnicodeObject *v = NULL;
3781 int done;
3782
3783 if (consumed)
3784 *consumed = 0;
3785
3786#ifdef NEED_RETRY
3787 retry:
3788 if (size > INT_MAX)
3789 done = decode_mbcs(&v, s, INT_MAX, 0);
3790 else
3791#endif
3792 done = decode_mbcs(&v, s, (int)size, !consumed);
3793
3794 if (done < 0) {
3795 Py_XDECREF(v);
3796 return NULL;
3797 }
3798
3799 if (consumed)
3800 *consumed += done;
3801
3802#ifdef NEED_RETRY
3803 if (size > INT_MAX) {
3804 s += done;
3805 size -= done;
3806 goto retry;
3807 }
3808#endif
3809
3810 return (PyObject *)v;
3811}
3812
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003813PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003814 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003815 const char *errors)
3816{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003817 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3818}
3819
3820/*
3821 * Convert unicode into string object (MBCS).
3822 * Returns 0 if succeed, -1 otherwise.
3823 */
3824static int encode_mbcs(PyObject **repr,
3825 const Py_UNICODE *p, /* unicode */
3826 int size) /* size of unicode */
3827{
3828 int mbcssize = 0;
3829 Py_ssize_t n = 0;
3830
3831 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003832
3833 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003834 if (size > 0) {
3835 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3836 if (mbcssize == 0) {
3837 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3838 return -1;
3839 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003840 }
3841
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003842 if (*repr == NULL) {
3843 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003844 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003845 if (*repr == NULL)
3846 return -1;
3847 }
3848 else {
3849 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003850 n = PyBytes_Size(*repr);
3851 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003852 return -1;
3853 }
3854
3855 /* Do the conversion */
3856 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003857 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003858 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3859 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3860 return -1;
3861 }
3862 }
3863
3864 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003865}
3866
3867PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003868 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003869 const char *errors)
3870{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003871 PyObject *repr = NULL;
3872 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003873
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003874#ifdef NEED_RETRY
3875 retry:
3876 if (size > INT_MAX)
3877 ret = encode_mbcs(&repr, p, INT_MAX);
3878 else
3879#endif
3880 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003882 if (ret < 0) {
3883 Py_XDECREF(repr);
3884 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003885 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003886
3887#ifdef NEED_RETRY
3888 if (size > INT_MAX) {
3889 p += INT_MAX;
3890 size -= INT_MAX;
3891 goto retry;
3892 }
3893#endif
3894
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003895 return repr;
3896}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003897
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003898PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3899{
3900 if (!PyUnicode_Check(unicode)) {
3901 PyErr_BadArgument();
3902 return NULL;
3903 }
3904 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3905 PyUnicode_GET_SIZE(unicode),
3906 NULL);
3907}
3908
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003909#undef NEED_RETRY
3910
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003911#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003912
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913/* --- Character Mapping Codec -------------------------------------------- */
3914
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003916 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 PyObject *mapping,
3918 const char *errors)
3919{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003921 Py_ssize_t startinpos;
3922 Py_ssize_t endinpos;
3923 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 PyUnicodeObject *v;
3926 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003927 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 PyObject *errorHandler = NULL;
3929 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003930 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003931 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003932
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 /* Default to Latin-1 */
3934 if (mapping == NULL)
3935 return PyUnicode_DecodeLatin1(s, size, errors);
3936
3937 v = _PyUnicode_New(size);
3938 if (v == NULL)
3939 goto onError;
3940 if (size == 0)
3941 return (PyObject *)v;
3942 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003944 if (PyUnicode_CheckExact(mapping)) {
3945 mapstring = PyUnicode_AS_UNICODE(mapping);
3946 maplen = PyUnicode_GET_SIZE(mapping);
3947 while (s < e) {
3948 unsigned char ch = *s;
3949 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003951 if (ch < maplen)
3952 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003954 if (x == 0xfffe) {
3955 /* undefined mapping */
3956 outpos = p-PyUnicode_AS_UNICODE(v);
3957 startinpos = s-starts;
3958 endinpos = startinpos+1;
3959 if (unicode_decode_call_errorhandler(
3960 errors, &errorHandler,
3961 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003962 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003963 (PyObject **)&v, &outpos, &p)) {
3964 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003965 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003966 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003967 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003968 *p++ = x;
3969 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003971 }
3972 else {
3973 while (s < e) {
3974 unsigned char ch = *s;
3975 PyObject *w, *x;
3976
3977 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3978 w = PyInt_FromLong((long)ch);
3979 if (w == NULL)
3980 goto onError;
3981 x = PyObject_GetItem(mapping, w);
3982 Py_DECREF(w);
3983 if (x == NULL) {
3984 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3985 /* No mapping found means: mapping is undefined. */
3986 PyErr_Clear();
3987 x = Py_None;
3988 Py_INCREF(x);
3989 } else
3990 goto onError;
3991 }
3992
3993 /* Apply mapping */
3994 if (PyInt_Check(x)) {
3995 long value = PyInt_AS_LONG(x);
3996 if (value < 0 || value > 65535) {
3997 PyErr_SetString(PyExc_TypeError,
3998 "character mapping must be in range(65536)");
3999 Py_DECREF(x);
4000 goto onError;
4001 }
4002 *p++ = (Py_UNICODE)value;
4003 }
4004 else if (x == Py_None) {
4005 /* undefined mapping */
4006 outpos = p-PyUnicode_AS_UNICODE(v);
4007 startinpos = s-starts;
4008 endinpos = startinpos+1;
4009 if (unicode_decode_call_errorhandler(
4010 errors, &errorHandler,
4011 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004012 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004013 (PyObject **)&v, &outpos, &p)) {
4014 Py_DECREF(x);
4015 goto onError;
4016 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004017 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004018 continue;
4019 }
4020 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004021 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004022
4023 if (targetsize == 1)
4024 /* 1-1 mapping */
4025 *p++ = *PyUnicode_AS_UNICODE(x);
4026
4027 else if (targetsize > 1) {
4028 /* 1-n mapping */
4029 if (targetsize > extrachars) {
4030 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004031 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4032 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004033 (targetsize << 2);
4034 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004035 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004036 if (_PyUnicode_Resize(&v,
4037 PyUnicode_GET_SIZE(v) + needed) < 0) {
4038 Py_DECREF(x);
4039 goto onError;
4040 }
4041 p = PyUnicode_AS_UNICODE(v) + oldpos;
4042 }
4043 Py_UNICODE_COPY(p,
4044 PyUnicode_AS_UNICODE(x),
4045 targetsize);
4046 p += targetsize;
4047 extrachars -= targetsize;
4048 }
4049 /* 1-0 mapping: skip the character */
4050 }
4051 else {
4052 /* wrong return value */
4053 PyErr_SetString(PyExc_TypeError,
4054 "character mapping must return integer, None or unicode");
4055 Py_DECREF(x);
4056 goto onError;
4057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004059 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 }
4062 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004063 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 Py_XDECREF(errorHandler);
4066 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004068
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 Py_XDECREF(errorHandler);
4071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 Py_XDECREF(v);
4073 return NULL;
4074}
4075
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004076/* Charmap encoding: the lookup table */
4077
4078struct encoding_map{
4079 PyObject_HEAD
4080 unsigned char level1[32];
4081 int count2, count3;
4082 unsigned char level23[1];
4083};
4084
4085static PyObject*
4086encoding_map_size(PyObject *obj, PyObject* args)
4087{
4088 struct encoding_map *map = (struct encoding_map*)obj;
4089 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4090 128*map->count3);
4091}
4092
4093static PyMethodDef encoding_map_methods[] = {
4094 {"size", encoding_map_size, METH_NOARGS,
4095 PyDoc_STR("Return the size (in bytes) of this object") },
4096 { 0 }
4097};
4098
4099static void
4100encoding_map_dealloc(PyObject* o)
4101{
4102 PyObject_FREE(o);
4103}
4104
4105static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004106 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004107 "EncodingMap", /*tp_name*/
4108 sizeof(struct encoding_map), /*tp_basicsize*/
4109 0, /*tp_itemsize*/
4110 /* methods */
4111 encoding_map_dealloc, /*tp_dealloc*/
4112 0, /*tp_print*/
4113 0, /*tp_getattr*/
4114 0, /*tp_setattr*/
4115 0, /*tp_compare*/
4116 0, /*tp_repr*/
4117 0, /*tp_as_number*/
4118 0, /*tp_as_sequence*/
4119 0, /*tp_as_mapping*/
4120 0, /*tp_hash*/
4121 0, /*tp_call*/
4122 0, /*tp_str*/
4123 0, /*tp_getattro*/
4124 0, /*tp_setattro*/
4125 0, /*tp_as_buffer*/
4126 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4127 0, /*tp_doc*/
4128 0, /*tp_traverse*/
4129 0, /*tp_clear*/
4130 0, /*tp_richcompare*/
4131 0, /*tp_weaklistoffset*/
4132 0, /*tp_iter*/
4133 0, /*tp_iternext*/
4134 encoding_map_methods, /*tp_methods*/
4135 0, /*tp_members*/
4136 0, /*tp_getset*/
4137 0, /*tp_base*/
4138 0, /*tp_dict*/
4139 0, /*tp_descr_get*/
4140 0, /*tp_descr_set*/
4141 0, /*tp_dictoffset*/
4142 0, /*tp_init*/
4143 0, /*tp_alloc*/
4144 0, /*tp_new*/
4145 0, /*tp_free*/
4146 0, /*tp_is_gc*/
4147};
4148
4149PyObject*
4150PyUnicode_BuildEncodingMap(PyObject* string)
4151{
4152 Py_UNICODE *decode;
4153 PyObject *result;
4154 struct encoding_map *mresult;
4155 int i;
4156 int need_dict = 0;
4157 unsigned char level1[32];
4158 unsigned char level2[512];
4159 unsigned char *mlevel1, *mlevel2, *mlevel3;
4160 int count2 = 0, count3 = 0;
4161
4162 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4163 PyErr_BadArgument();
4164 return NULL;
4165 }
4166 decode = PyUnicode_AS_UNICODE(string);
4167 memset(level1, 0xFF, sizeof level1);
4168 memset(level2, 0xFF, sizeof level2);
4169
4170 /* If there isn't a one-to-one mapping of NULL to \0,
4171 or if there are non-BMP characters, we need to use
4172 a mapping dictionary. */
4173 if (decode[0] != 0)
4174 need_dict = 1;
4175 for (i = 1; i < 256; i++) {
4176 int l1, l2;
4177 if (decode[i] == 0
4178 #ifdef Py_UNICODE_WIDE
4179 || decode[i] > 0xFFFF
4180 #endif
4181 ) {
4182 need_dict = 1;
4183 break;
4184 }
4185 if (decode[i] == 0xFFFE)
4186 /* unmapped character */
4187 continue;
4188 l1 = decode[i] >> 11;
4189 l2 = decode[i] >> 7;
4190 if (level1[l1] == 0xFF)
4191 level1[l1] = count2++;
4192 if (level2[l2] == 0xFF)
4193 level2[l2] = count3++;
4194 }
4195
4196 if (count2 >= 0xFF || count3 >= 0xFF)
4197 need_dict = 1;
4198
4199 if (need_dict) {
4200 PyObject *result = PyDict_New();
4201 PyObject *key, *value;
4202 if (!result)
4203 return NULL;
4204 for (i = 0; i < 256; i++) {
4205 key = value = NULL;
4206 key = PyInt_FromLong(decode[i]);
4207 value = PyInt_FromLong(i);
4208 if (!key || !value)
4209 goto failed1;
4210 if (PyDict_SetItem(result, key, value) == -1)
4211 goto failed1;
4212 Py_DECREF(key);
4213 Py_DECREF(value);
4214 }
4215 return result;
4216 failed1:
4217 Py_XDECREF(key);
4218 Py_XDECREF(value);
4219 Py_DECREF(result);
4220 return NULL;
4221 }
4222
4223 /* Create a three-level trie */
4224 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4225 16*count2 + 128*count3 - 1);
4226 if (!result)
4227 return PyErr_NoMemory();
4228 PyObject_Init(result, &EncodingMapType);
4229 mresult = (struct encoding_map*)result;
4230 mresult->count2 = count2;
4231 mresult->count3 = count3;
4232 mlevel1 = mresult->level1;
4233 mlevel2 = mresult->level23;
4234 mlevel3 = mresult->level23 + 16*count2;
4235 memcpy(mlevel1, level1, 32);
4236 memset(mlevel2, 0xFF, 16*count2);
4237 memset(mlevel3, 0, 128*count3);
4238 count3 = 0;
4239 for (i = 1; i < 256; i++) {
4240 int o1, o2, o3, i2, i3;
4241 if (decode[i] == 0xFFFE)
4242 /* unmapped character */
4243 continue;
4244 o1 = decode[i]>>11;
4245 o2 = (decode[i]>>7) & 0xF;
4246 i2 = 16*mlevel1[o1] + o2;
4247 if (mlevel2[i2] == 0xFF)
4248 mlevel2[i2] = count3++;
4249 o3 = decode[i] & 0x7F;
4250 i3 = 128*mlevel2[i2] + o3;
4251 mlevel3[i3] = i;
4252 }
4253 return result;
4254}
4255
4256static int
4257encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4258{
4259 struct encoding_map *map = (struct encoding_map*)mapping;
4260 int l1 = c>>11;
4261 int l2 = (c>>7) & 0xF;
4262 int l3 = c & 0x7F;
4263 int i;
4264
4265#ifdef Py_UNICODE_WIDE
4266 if (c > 0xFFFF) {
4267 return -1;
4268 }
4269#endif
4270 if (c == 0)
4271 return 0;
4272 /* level 1*/
4273 i = map->level1[l1];
4274 if (i == 0xFF) {
4275 return -1;
4276 }
4277 /* level 2*/
4278 i = map->level23[16*i+l2];
4279 if (i == 0xFF) {
4280 return -1;
4281 }
4282 /* level 3 */
4283 i = map->level23[16*map->count2 + 128*i + l3];
4284 if (i == 0) {
4285 return -1;
4286 }
4287 return i;
4288}
4289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290/* Lookup the character ch in the mapping. If the character
4291 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004292 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 PyObject *w = PyInt_FromLong((long)c);
4296 PyObject *x;
4297
4298 if (w == NULL)
4299 return NULL;
4300 x = PyObject_GetItem(mapping, w);
4301 Py_DECREF(w);
4302 if (x == NULL) {
4303 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4304 /* No mapping found means: mapping is undefined. */
4305 PyErr_Clear();
4306 x = Py_None;
4307 Py_INCREF(x);
4308 return x;
4309 } else
4310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004312 else if (x == Py_None)
4313 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 else if (PyInt_Check(x)) {
4315 long value = PyInt_AS_LONG(x);
4316 if (value < 0 || value > 255) {
4317 PyErr_SetString(PyExc_TypeError,
4318 "character mapping must be in range(256)");
4319 Py_DECREF(x);
4320 return NULL;
4321 }
4322 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004324 else if (PyString_Check(x))
4325 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004328 PyErr_Format(PyExc_TypeError,
4329 "character mapping must return integer, None or str8, not %.400s",
4330 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 Py_DECREF(x);
4332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 }
4334}
4335
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004336static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004337charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004338{
Walter Dörwald827b0552007-05-12 13:23:53 +00004339 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004340 /* exponentially overallocate to minimize reallocations */
4341 if (requiredsize < 2*outsize)
4342 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004343 if (PyBytes_Resize(outobj, requiredsize)) {
4344 Py_DECREF(outobj);
4345 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004346 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004347 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004348}
4349
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004354 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 space is available. Return a new reference to the object that
4356 was put in the output buffer, or Py_None, if the mapping was undefined
4357 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004358 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004360charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004361 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004363 PyObject *rep;
4364 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004365 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004367 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004368 int res = encoding_map_lookup(c, mapping);
4369 Py_ssize_t requiredsize = *outpos+1;
4370 if (res == -1)
4371 return enc_FAILED;
4372 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004373 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004374 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004375 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004376 outstart[(*outpos)++] = (char)res;
4377 return enc_SUCCESS;
4378 }
4379
4380 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004382 return enc_EXCEPTION;
4383 else if (rep==Py_None) {
4384 Py_DECREF(rep);
4385 return enc_FAILED;
4386 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004388 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004389 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004390 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004392 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004394 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4396 }
4397 else {
4398 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004399 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4400 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004401 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004402 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004404 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004406 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 memcpy(outstart + *outpos, repchars, repsize);
4408 *outpos += repsize;
4409 }
4410 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004411 Py_DECREF(rep);
4412 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413}
4414
4415/* handle an error in PyUnicode_EncodeCharmap
4416 Return 0 on success, -1 on error */
4417static
4418int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004419 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004421 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004422 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423{
4424 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004425 Py_ssize_t repsize;
4426 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 Py_UNICODE *uni2;
4428 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004429 Py_ssize_t collstartpos = *inpos;
4430 Py_ssize_t collendpos = *inpos+1;
4431 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 char *encoding = "charmap";
4433 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004434 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 /* find all unencodable characters */
4437 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004438 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004439 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004440 int res = encoding_map_lookup(p[collendpos], mapping);
4441 if (res != -1)
4442 break;
4443 ++collendpos;
4444 continue;
4445 }
4446
4447 rep = charmapencode_lookup(p[collendpos], mapping);
4448 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004450 else if (rep!=Py_None) {
4451 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 break;
4453 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004454 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 ++collendpos;
4456 }
4457 /* cache callback name lookup
4458 * (if not done yet, i.e. it's the first error) */
4459 if (*known_errorHandler==-1) {
4460 if ((errors==NULL) || (!strcmp(errors, "strict")))
4461 *known_errorHandler = 1;
4462 else if (!strcmp(errors, "replace"))
4463 *known_errorHandler = 2;
4464 else if (!strcmp(errors, "ignore"))
4465 *known_errorHandler = 3;
4466 else if (!strcmp(errors, "xmlcharrefreplace"))
4467 *known_errorHandler = 4;
4468 else
4469 *known_errorHandler = 0;
4470 }
4471 switch (*known_errorHandler) {
4472 case 1: /* strict */
4473 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4474 return -1;
4475 case 2: /* replace */
4476 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4477 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004478 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 return -1;
4480 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004481 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4483 return -1;
4484 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 }
4486 /* fall through */
4487 case 3: /* ignore */
4488 *inpos = collendpos;
4489 break;
4490 case 4: /* xmlcharrefreplace */
4491 /* generate replacement (temporarily (mis)uses p) */
4492 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4493 char buffer[2+29+1+1];
4494 char *cp;
4495 sprintf(buffer, "&#%d;", (int)p[collpos]);
4496 for (cp = buffer; *cp; ++cp) {
4497 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004498 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004500 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4502 return -1;
4503 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 }
4505 }
4506 *inpos = collendpos;
4507 break;
4508 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004509 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 encoding, reason, p, size, exceptionObject,
4511 collstartpos, collendpos, &newpos);
4512 if (repunicode == NULL)
4513 return -1;
4514 /* generate replacement */
4515 repsize = PyUnicode_GET_SIZE(repunicode);
4516 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4517 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004518 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 return -1;
4520 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004521 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4524 return -1;
4525 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 }
4527 *inpos = newpos;
4528 Py_DECREF(repunicode);
4529 }
4530 return 0;
4531}
4532
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004534 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 PyObject *mapping,
4536 const char *errors)
4537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 /* output object */
4539 PyObject *res = NULL;
4540 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004541 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004543 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 PyObject *errorHandler = NULL;
4545 PyObject *exc = NULL;
4546 /* the following variable is used for caching string comparisons
4547 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4548 * 3=ignore, 4=xmlcharrefreplace */
4549 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550
4551 /* Default to Latin-1 */
4552 if (mapping == NULL)
4553 return PyUnicode_EncodeLatin1(p, size, errors);
4554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 /* allocate enough for a simple encoding without
4556 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004557 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 if (res == NULL)
4559 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004560 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 while (inpos<size) {
4564 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004565 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004566 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004568 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 if (charmap_encoding_error(p, size, &inpos, mapping,
4570 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004571 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004572 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004573 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 else
4577 /* done with this character => adjust input position */
4578 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004582 if (respos<PyBytes_GET_SIZE(res)) {
4583 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 goto onError;
4585 }
4586 Py_XDECREF(exc);
4587 Py_XDECREF(errorHandler);
4588 return res;
4589
4590 onError:
4591 Py_XDECREF(res);
4592 Py_XDECREF(exc);
4593 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 return NULL;
4595}
4596
4597PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4598 PyObject *mapping)
4599{
4600 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4601 PyErr_BadArgument();
4602 return NULL;
4603 }
4604 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4605 PyUnicode_GET_SIZE(unicode),
4606 mapping,
4607 NULL);
4608}
4609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610/* create or adjust a UnicodeTranslateError */
4611static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004612 const Py_UNICODE *unicode, Py_ssize_t size,
4613 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 if (*exceptionObject == NULL) {
4617 *exceptionObject = PyUnicodeTranslateError_Create(
4618 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619 }
4620 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4622 goto onError;
4623 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4624 goto onError;
4625 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4626 goto onError;
4627 return;
4628 onError:
4629 Py_DECREF(*exceptionObject);
4630 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631 }
4632}
4633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634/* raises a UnicodeTranslateError */
4635static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004636 const Py_UNICODE *unicode, Py_ssize_t size,
4637 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 const char *reason)
4639{
4640 make_translate_exception(exceptionObject,
4641 unicode, size, startpos, endpos, reason);
4642 if (*exceptionObject != NULL)
4643 PyCodec_StrictErrors(*exceptionObject);
4644}
4645
4646/* error handling callback helper:
4647 build arguments, call the callback and check the arguments,
4648 put the result into newpos and return the replacement string, which
4649 has to be freed by the caller */
4650static PyObject *unicode_translate_call_errorhandler(const char *errors,
4651 PyObject **errorHandler,
4652 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4654 Py_ssize_t startpos, Py_ssize_t endpos,
4655 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004657 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004659 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 PyObject *restuple;
4661 PyObject *resunicode;
4662
4663 if (*errorHandler == NULL) {
4664 *errorHandler = PyCodec_LookupError(errors);
4665 if (*errorHandler == NULL)
4666 return NULL;
4667 }
4668
4669 make_translate_exception(exceptionObject,
4670 unicode, size, startpos, endpos, reason);
4671 if (*exceptionObject == NULL)
4672 return NULL;
4673
4674 restuple = PyObject_CallFunctionObjArgs(
4675 *errorHandler, *exceptionObject, NULL);
4676 if (restuple == NULL)
4677 return NULL;
4678 if (!PyTuple_Check(restuple)) {
4679 PyErr_Format(PyExc_TypeError, &argparse[4]);
4680 Py_DECREF(restuple);
4681 return NULL;
4682 }
4683 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004684 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 Py_DECREF(restuple);
4686 return NULL;
4687 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004688 if (i_newpos<0)
4689 *newpos = size+i_newpos;
4690 else
4691 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004692 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004693 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004694 Py_DECREF(restuple);
4695 return NULL;
4696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 Py_INCREF(resunicode);
4698 Py_DECREF(restuple);
4699 return resunicode;
4700}
4701
4702/* Lookup the character ch in the mapping and put the result in result,
4703 which must be decrefed by the caller.
4704 Return 0 on success, -1 on error */
4705static
4706int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4707{
4708 PyObject *w = PyInt_FromLong((long)c);
4709 PyObject *x;
4710
4711 if (w == NULL)
4712 return -1;
4713 x = PyObject_GetItem(mapping, w);
4714 Py_DECREF(w);
4715 if (x == NULL) {
4716 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4717 /* No mapping found means: use 1:1 mapping. */
4718 PyErr_Clear();
4719 *result = NULL;
4720 return 0;
4721 } else
4722 return -1;
4723 }
4724 else if (x == Py_None) {
4725 *result = x;
4726 return 0;
4727 }
4728 else if (PyInt_Check(x)) {
4729 long value = PyInt_AS_LONG(x);
4730 long max = PyUnicode_GetMax();
4731 if (value < 0 || value > max) {
4732 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004733 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 Py_DECREF(x);
4735 return -1;
4736 }
4737 *result = x;
4738 return 0;
4739 }
4740 else if (PyUnicode_Check(x)) {
4741 *result = x;
4742 return 0;
4743 }
4744 else {
4745 /* wrong return value */
4746 PyErr_SetString(PyExc_TypeError,
4747 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004748 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 return -1;
4750 }
4751}
4752/* ensure that *outobj is at least requiredsize characters long,
4753if not reallocate and adjust various state variables.
4754Return 0 on success, -1 on error */
4755static
Walter Dörwald4894c302003-10-24 14:25:28 +00004756int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004757 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004759 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004760 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004762 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004764 if (requiredsize < 2 * oldsize)
4765 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004766 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 return -1;
4768 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 }
4770 return 0;
4771}
4772/* lookup the character, put the result in the output string and adjust
4773 various state variables. Return a new reference to the object that
4774 was put in the output buffer in *result, or Py_None, if the mapping was
4775 undefined (in which case no character was written).
4776 The called must decref result.
4777 Return 0 on success, -1 on error. */
4778static
Walter Dörwald4894c302003-10-24 14:25:28 +00004779int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004780 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004781 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782{
Walter Dörwald4894c302003-10-24 14:25:28 +00004783 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784 return -1;
4785 if (*res==NULL) {
4786 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004787 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004788 }
4789 else if (*res==Py_None)
4790 ;
4791 else if (PyInt_Check(*res)) {
4792 /* no overflow check, because we know that the space is enough */
4793 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4794 }
4795 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004796 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797 if (repsize==1) {
4798 /* no overflow check, because we know that the space is enough */
4799 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4800 }
4801 else if (repsize!=0) {
4802 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004804 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004805 repsize - 1;
4806 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 return -1;
4808 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4809 *outp += repsize;
4810 }
4811 }
4812 else
4813 return -1;
4814 return 0;
4815}
4816
4817PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004818 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 PyObject *mapping,
4820 const char *errors)
4821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 /* output object */
4823 PyObject *res = NULL;
4824 /* pointers to the beginning and end+1 of input */
4825 const Py_UNICODE *startp = p;
4826 const Py_UNICODE *endp = p + size;
4827 /* pointer into the output */
4828 Py_UNICODE *str;
4829 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004830 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 char *reason = "character maps to <undefined>";
4832 PyObject *errorHandler = NULL;
4833 PyObject *exc = NULL;
4834 /* the following variable is used for caching string comparisons
4835 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4836 * 3=ignore, 4=xmlcharrefreplace */
4837 int known_errorHandler = -1;
4838
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 if (mapping == NULL) {
4840 PyErr_BadArgument();
4841 return NULL;
4842 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843
4844 /* allocate enough for a simple 1:1 translation without
4845 replacements, if we need more, we'll resize */
4846 res = PyUnicode_FromUnicode(NULL, size);
4847 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004848 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 return res;
4851 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 while (p<endp) {
4854 /* try to encode it */
4855 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004856 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 goto onError;
4859 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004860 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 if (x!=Py_None) /* it worked => adjust input pointer */
4862 ++p;
4863 else { /* untranslatable character */
4864 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004865 Py_ssize_t repsize;
4866 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 Py_UNICODE *uni2;
4868 /* startpos for collecting untranslatable chars */
4869 const Py_UNICODE *collstart = p;
4870 const Py_UNICODE *collend = p+1;
4871 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873 /* find all untranslatable characters */
4874 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004875 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 goto onError;
4877 Py_XDECREF(x);
4878 if (x!=Py_None)
4879 break;
4880 ++collend;
4881 }
4882 /* cache callback name lookup
4883 * (if not done yet, i.e. it's the first error) */
4884 if (known_errorHandler==-1) {
4885 if ((errors==NULL) || (!strcmp(errors, "strict")))
4886 known_errorHandler = 1;
4887 else if (!strcmp(errors, "replace"))
4888 known_errorHandler = 2;
4889 else if (!strcmp(errors, "ignore"))
4890 known_errorHandler = 3;
4891 else if (!strcmp(errors, "xmlcharrefreplace"))
4892 known_errorHandler = 4;
4893 else
4894 known_errorHandler = 0;
4895 }
4896 switch (known_errorHandler) {
4897 case 1: /* strict */
4898 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4899 goto onError;
4900 case 2: /* replace */
4901 /* No need to check for space, this is a 1:1 replacement */
4902 for (coll = collstart; coll<collend; ++coll)
4903 *str++ = '?';
4904 /* fall through */
4905 case 3: /* ignore */
4906 p = collend;
4907 break;
4908 case 4: /* xmlcharrefreplace */
4909 /* generate replacement (temporarily (mis)uses p) */
4910 for (p = collstart; p < collend; ++p) {
4911 char buffer[2+29+1+1];
4912 char *cp;
4913 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004914 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4916 goto onError;
4917 for (cp = buffer; *cp; ++cp)
4918 *str++ = *cp;
4919 }
4920 p = collend;
4921 break;
4922 default:
4923 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4924 reason, startp, size, &exc,
4925 collstart-startp, collend-startp, &newpos);
4926 if (repunicode == NULL)
4927 goto onError;
4928 /* generate replacement */
4929 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004930 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004931 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4932 Py_DECREF(repunicode);
4933 goto onError;
4934 }
4935 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4936 *str++ = *uni2;
4937 p = startp + newpos;
4938 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 }
4940 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 /* Resize if we allocated to much */
4943 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004944 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004945 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 }
4948 Py_XDECREF(exc);
4949 Py_XDECREF(errorHandler);
4950 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 onError:
4953 Py_XDECREF(res);
4954 Py_XDECREF(exc);
4955 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 return NULL;
4957}
4958
4959PyObject *PyUnicode_Translate(PyObject *str,
4960 PyObject *mapping,
4961 const char *errors)
4962{
4963 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004964
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 str = PyUnicode_FromObject(str);
4966 if (str == NULL)
4967 goto onError;
4968 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4969 PyUnicode_GET_SIZE(str),
4970 mapping,
4971 errors);
4972 Py_DECREF(str);
4973 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004974
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 onError:
4976 Py_XDECREF(str);
4977 return NULL;
4978}
Tim Petersced69f82003-09-16 20:30:58 +00004979
Guido van Rossum9e896b32000-04-05 20:11:21 +00004980/* --- Decimal Encoder ---------------------------------------------------- */
4981
4982int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004983 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004984 char *output,
4985 const char *errors)
4986{
4987 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 PyObject *errorHandler = NULL;
4989 PyObject *exc = NULL;
4990 const char *encoding = "decimal";
4991 const char *reason = "invalid decimal Unicode string";
4992 /* the following variable is used for caching string comparisons
4993 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4994 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004995
4996 if (output == NULL) {
4997 PyErr_BadArgument();
4998 return -1;
4999 }
5000
5001 p = s;
5002 end = s + length;
5003 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005005 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005007 Py_ssize_t repsize;
5008 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005009 Py_UNICODE *uni2;
5010 Py_UNICODE *collstart;
5011 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005012
Guido van Rossum9e896b32000-04-05 20:11:21 +00005013 if (Py_UNICODE_ISSPACE(ch)) {
5014 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005015 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005016 continue;
5017 }
5018 decimal = Py_UNICODE_TODECIMAL(ch);
5019 if (decimal >= 0) {
5020 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005021 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005022 continue;
5023 }
Guido van Rossumba477042000-04-06 18:18:10 +00005024 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005025 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005026 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005027 continue;
5028 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 /* All other characters are considered unencodable */
5030 collstart = p;
5031 collend = p+1;
5032 while (collend < end) {
5033 if ((0 < *collend && *collend < 256) ||
5034 !Py_UNICODE_ISSPACE(*collend) ||
5035 Py_UNICODE_TODECIMAL(*collend))
5036 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005037 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 /* cache callback name lookup
5039 * (if not done yet, i.e. it's the first error) */
5040 if (known_errorHandler==-1) {
5041 if ((errors==NULL) || (!strcmp(errors, "strict")))
5042 known_errorHandler = 1;
5043 else if (!strcmp(errors, "replace"))
5044 known_errorHandler = 2;
5045 else if (!strcmp(errors, "ignore"))
5046 known_errorHandler = 3;
5047 else if (!strcmp(errors, "xmlcharrefreplace"))
5048 known_errorHandler = 4;
5049 else
5050 known_errorHandler = 0;
5051 }
5052 switch (known_errorHandler) {
5053 case 1: /* strict */
5054 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5055 goto onError;
5056 case 2: /* replace */
5057 for (p = collstart; p < collend; ++p)
5058 *output++ = '?';
5059 /* fall through */
5060 case 3: /* ignore */
5061 p = collend;
5062 break;
5063 case 4: /* xmlcharrefreplace */
5064 /* generate replacement (temporarily (mis)uses p) */
5065 for (p = collstart; p < collend; ++p)
5066 output += sprintf(output, "&#%d;", (int)*p);
5067 p = collend;
5068 break;
5069 default:
5070 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5071 encoding, reason, s, length, &exc,
5072 collstart-s, collend-s, &newpos);
5073 if (repunicode == NULL)
5074 goto onError;
5075 /* generate replacement */
5076 repsize = PyUnicode_GET_SIZE(repunicode);
5077 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5078 Py_UNICODE ch = *uni2;
5079 if (Py_UNICODE_ISSPACE(ch))
5080 *output++ = ' ';
5081 else {
5082 decimal = Py_UNICODE_TODECIMAL(ch);
5083 if (decimal >= 0)
5084 *output++ = '0' + decimal;
5085 else if (0 < ch && ch < 256)
5086 *output++ = (char)ch;
5087 else {
5088 Py_DECREF(repunicode);
5089 raise_encode_exception(&exc, encoding,
5090 s, length, collstart-s, collend-s, reason);
5091 goto onError;
5092 }
5093 }
5094 }
5095 p = s + newpos;
5096 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005097 }
5098 }
5099 /* 0-terminate the output string */
5100 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005101 Py_XDECREF(exc);
5102 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005103 return 0;
5104
5105 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106 Py_XDECREF(exc);
5107 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005108 return -1;
5109}
5110
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111/* --- Helpers ------------------------------------------------------------ */
5112
Eric Smith8c663262007-08-25 02:26:07 +00005113#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005114
5115#include "stringlib/fastsearch.h"
5116
5117#include "stringlib/count.h"
5118#include "stringlib/find.h"
5119#include "stringlib/partition.h"
5120
/* Helper macro to fix up start/end slice values against (obj)->length.

   Operates on the variables `start` and `end` of the enclosing scope:
     - a negative start has the length added and is then clamped to 0
       (start is not capped at the length);
     - end is first capped at the length; a negative end then has the
       length added and is clamped to 0.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and stays correct under an unbraced if/else.  Use it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5133
Martin v. Löwis18e16552006-02-15 17:27:45 +00005134Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005135 PyObject *substr,
5136 Py_ssize_t start,
5137 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005139 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005140 PyUnicodeObject* str_obj;
5141 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005142
Thomas Wouters477c8d52006-05-27 19:21:47 +00005143 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5144 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005146 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5147 if (!sub_obj) {
5148 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 return -1;
5150 }
Tim Petersced69f82003-09-16 20:30:58 +00005151
Thomas Wouters477c8d52006-05-27 19:21:47 +00005152 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005153
Thomas Wouters477c8d52006-05-27 19:21:47 +00005154 result = stringlib_count(
5155 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5156 );
5157
5158 Py_DECREF(sub_obj);
5159 Py_DECREF(str_obj);
5160
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 return result;
5162}
5163
Martin v. Löwis18e16552006-02-15 17:27:45 +00005164Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005165 PyObject *sub,
5166 Py_ssize_t start,
5167 Py_ssize_t end,
5168 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005171
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005173 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005174 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005175 sub = PyUnicode_FromObject(sub);
5176 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005177 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005178 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 }
Tim Petersced69f82003-09-16 20:30:58 +00005180
Thomas Wouters477c8d52006-05-27 19:21:47 +00005181 if (direction > 0)
5182 result = stringlib_find_slice(
5183 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5184 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5185 start, end
5186 );
5187 else
5188 result = stringlib_rfind_slice(
5189 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5190 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5191 start, end
5192 );
5193
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005195 Py_DECREF(sub);
5196
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 return result;
5198}
5199
Tim Petersced69f82003-09-16 20:30:58 +00005200static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201int tailmatch(PyUnicodeObject *self,
5202 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t start,
5204 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 int direction)
5206{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 if (substring->length == 0)
5208 return 1;
5209
Thomas Wouters477c8d52006-05-27 19:21:47 +00005210 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211
5212 end -= substring->length;
5213 if (end < start)
5214 return 0;
5215
5216 if (direction > 0) {
5217 if (Py_UNICODE_MATCH(self, end, substring))
5218 return 1;
5219 } else {
5220 if (Py_UNICODE_MATCH(self, start, substring))
5221 return 1;
5222 }
5223
5224 return 0;
5225}
5226
Martin v. Löwis18e16552006-02-15 17:27:45 +00005227Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 Py_ssize_t start,
5230 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 int direction)
5232{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005233 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005234
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 str = PyUnicode_FromObject(str);
5236 if (str == NULL)
5237 return -1;
5238 substr = PyUnicode_FromObject(substr);
5239 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005240 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 return -1;
5242 }
Tim Petersced69f82003-09-16 20:30:58 +00005243
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 result = tailmatch((PyUnicodeObject *)str,
5245 (PyUnicodeObject *)substr,
5246 start, end, direction);
5247 Py_DECREF(str);
5248 Py_DECREF(substr);
5249 return result;
5250}
5251
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252/* Apply fixfct filter to the Unicode object self and return a
5253 reference to the modified object */
5254
Tim Petersced69f82003-09-16 20:30:58 +00005255static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256PyObject *fixup(PyUnicodeObject *self,
5257 int (*fixfct)(PyUnicodeObject *s))
5258{
5259
5260 PyUnicodeObject *u;
5261
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005262 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 if (u == NULL)
5264 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005265
5266 Py_UNICODE_COPY(u->str, self->str, self->length);
5267
Tim Peters7a29bd52001-09-12 03:03:31 +00005268 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 /* fixfct should return TRUE if it modified the buffer. If
5270 FALSE, return a reference to the original buffer instead
5271 (to save space, not time) */
5272 Py_INCREF(self);
5273 Py_DECREF(u);
5274 return (PyObject*) self;
5275 }
5276 return (PyObject*) u;
5277}
5278
Tim Petersced69f82003-09-16 20:30:58 +00005279static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280int fixupper(PyUnicodeObject *self)
5281{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005282 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 Py_UNICODE *s = self->str;
5284 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005285
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 while (len-- > 0) {
5287 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 ch = Py_UNICODE_TOUPPER(*s);
5290 if (ch != *s) {
5291 status = 1;
5292 *s = ch;
5293 }
5294 s++;
5295 }
5296
5297 return status;
5298}
5299
Tim Petersced69f82003-09-16 20:30:58 +00005300static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301int fixlower(PyUnicodeObject *self)
5302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005303 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 Py_UNICODE *s = self->str;
5305 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005306
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 while (len-- > 0) {
5308 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005309
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 ch = Py_UNICODE_TOLOWER(*s);
5311 if (ch != *s) {
5312 status = 1;
5313 *s = ch;
5314 }
5315 s++;
5316 }
5317
5318 return status;
5319}
5320
Tim Petersced69f82003-09-16 20:30:58 +00005321static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322int fixswapcase(PyUnicodeObject *self)
5323{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005324 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 Py_UNICODE *s = self->str;
5326 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005327
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 while (len-- > 0) {
5329 if (Py_UNICODE_ISUPPER(*s)) {
5330 *s = Py_UNICODE_TOLOWER(*s);
5331 status = 1;
5332 } else if (Py_UNICODE_ISLOWER(*s)) {
5333 *s = Py_UNICODE_TOUPPER(*s);
5334 status = 1;
5335 }
5336 s++;
5337 }
5338
5339 return status;
5340}
5341
Tim Petersced69f82003-09-16 20:30:58 +00005342static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343int fixcapitalize(PyUnicodeObject *self)
5344{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005345 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005346 Py_UNICODE *s = self->str;
5347 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005348
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005349 if (len == 0)
5350 return 0;
5351 if (Py_UNICODE_ISLOWER(*s)) {
5352 *s = Py_UNICODE_TOUPPER(*s);
5353 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005355 s++;
5356 while (--len > 0) {
5357 if (Py_UNICODE_ISUPPER(*s)) {
5358 *s = Py_UNICODE_TOLOWER(*s);
5359 status = 1;
5360 }
5361 s++;
5362 }
5363 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364}
5365
5366static
5367int fixtitle(PyUnicodeObject *self)
5368{
5369 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5370 register Py_UNICODE *e;
5371 int previous_is_cased;
5372
5373 /* Shortcut for single character strings */
5374 if (PyUnicode_GET_SIZE(self) == 1) {
5375 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5376 if (*p != ch) {
5377 *p = ch;
5378 return 1;
5379 }
5380 else
5381 return 0;
5382 }
Tim Petersced69f82003-09-16 20:30:58 +00005383
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 e = p + PyUnicode_GET_SIZE(self);
5385 previous_is_cased = 0;
5386 for (; p < e; p++) {
5387 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 if (previous_is_cased)
5390 *p = Py_UNICODE_TOLOWER(ch);
5391 else
5392 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005393
5394 if (Py_UNICODE_ISLOWER(ch) ||
5395 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 Py_UNICODE_ISTITLE(ch))
5397 previous_is_cased = 1;
5398 else
5399 previous_is_cased = 0;
5400 }
5401 return 1;
5402}
5403
Tim Peters8ce9f162004-08-27 01:49:32 +00005404PyObject *
5405PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406{
Tim Peters8ce9f162004-08-27 01:49:32 +00005407 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005408 const Py_UNICODE blank = ' ';
5409 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005410 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005411 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005412 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5413 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005414 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5415 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005416 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005417 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005418 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419
Tim Peters05eba1f2004-08-27 21:32:02 +00005420 fseq = PySequence_Fast(seq, "");
5421 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005422 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005423 }
5424
Tim Peters91879ab2004-08-27 22:35:44 +00005425 /* Grrrr. A codec may be invoked to convert str objects to
5426 * Unicode, and so it's possible to call back into Python code
5427 * during PyUnicode_FromObject(), and so it's possible for a sick
5428 * codec to change the size of fseq (if seq is a list). Therefore
5429 * we have to keep refetching the size -- can't assume seqlen
5430 * is invariant.
5431 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005432 seqlen = PySequence_Fast_GET_SIZE(fseq);
5433 /* If empty sequence, return u"". */
5434 if (seqlen == 0) {
5435 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5436 goto Done;
5437 }
5438 /* If singleton sequence with an exact Unicode, return that. */
5439 if (seqlen == 1) {
5440 item = PySequence_Fast_GET_ITEM(fseq, 0);
5441 if (PyUnicode_CheckExact(item)) {
5442 Py_INCREF(item);
5443 res = (PyUnicodeObject *)item;
5444 goto Done;
5445 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005446 }
5447
Tim Peters05eba1f2004-08-27 21:32:02 +00005448 /* At least two items to join, or one that isn't exact Unicode. */
5449 if (seqlen > 1) {
5450 /* Set up sep and seplen -- they're needed. */
5451 if (separator == NULL) {
5452 sep = &blank;
5453 seplen = 1;
5454 }
5455 else {
5456 internal_separator = PyUnicode_FromObject(separator);
5457 if (internal_separator == NULL)
5458 goto onError;
5459 sep = PyUnicode_AS_UNICODE(internal_separator);
5460 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005461 /* In case PyUnicode_FromObject() mutated seq. */
5462 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005463 }
5464 }
5465
5466 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005467 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005468 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005469 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005470 res_p = PyUnicode_AS_UNICODE(res);
5471 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005472
Tim Peters05eba1f2004-08-27 21:32:02 +00005473 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005474 Py_ssize_t itemlen;
5475 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005476
5477 item = PySequence_Fast_GET_ITEM(fseq, i);
5478 /* Convert item to Unicode. */
Guido van Rossumf1044292007-09-27 18:01:22 +00005479 if (!PyString_Check(item) && !PyUnicode_Check(item))
5480 {
5481 if (PyBytes_Check(item))
5482 {
5483 PyErr_Format(PyExc_TypeError,
5484 "sequence item %d: join() will not operate on "
5485 "bytes objects", i);
5486 goto onError;
5487 }
5488 item = PyObject_Unicode(item);
Tim Peters8ce9f162004-08-27 01:49:32 +00005489 }
Guido van Rossumf1044292007-09-27 18:01:22 +00005490 else
5491 item = PyUnicode_FromObject(item);
5492
Tim Peters05eba1f2004-08-27 21:32:02 +00005493 if (item == NULL)
5494 goto onError;
5495 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005496
Tim Peters91879ab2004-08-27 22:35:44 +00005497 /* In case PyUnicode_FromObject() mutated seq. */
5498 seqlen = PySequence_Fast_GET_SIZE(fseq);
5499
Tim Peters8ce9f162004-08-27 01:49:32 +00005500 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005502 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005503 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005504 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 if (i < seqlen - 1) {
5506 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005507 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 goto Overflow;
5509 }
5510 if (new_res_used > res_alloc) {
5511 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005512 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005513 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005514 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005515 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005516 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005517 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005518 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005520 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005521 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005523
5524 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005525 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005526 res_p += itemlen;
5527 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005528 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005529 res_p += seplen;
5530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005532 res_used = new_res_used;
5533 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005534
Tim Peters05eba1f2004-08-27 21:32:02 +00005535 /* Shrink res to match the used area; this probably can't fail,
5536 * but it's cheap to check.
5537 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005538 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005539 goto onError;
5540
5541 Done:
5542 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005543 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 return (PyObject *)res;
5545
Tim Peters8ce9f162004-08-27 01:49:32 +00005546 Overflow:
5547 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005548 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005549 Py_DECREF(item);
5550 /* fall through */
5551
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005553 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005554 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005555 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 return NULL;
5557}
5558
Tim Petersced69f82003-09-16 20:30:58 +00005559static
5560PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t left,
5562 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 Py_UNICODE fill)
5564{
5565 PyUnicodeObject *u;
5566
5567 if (left < 0)
5568 left = 0;
5569 if (right < 0)
5570 right = 0;
5571
Tim Peters7a29bd52001-09-12 03:03:31 +00005572 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 Py_INCREF(self);
5574 return self;
5575 }
5576
5577 u = _PyUnicode_New(left + self->length + right);
5578 if (u) {
5579 if (left)
5580 Py_UNICODE_FILL(u->str, fill, left);
5581 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5582 if (right)
5583 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5584 }
5585
5586 return u;
5587}
5588
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on names in the calling function: a `PyObject *str` scratch
   variable, the result `list`, and an `onError` label that is jumped to
   when either the slice creation or the append fails.

   Wrapped in do { } while (0) so the expansion is a single statement and
   is safe inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5599
5600static
5601PyObject *split_whitespace(PyUnicodeObject *self,
5602 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005603 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005605 register Py_ssize_t i;
5606 register Py_ssize_t j;
5607 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 PyObject *str;
5609
5610 for (i = j = 0; i < len; ) {
5611 /* find a token */
5612 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5613 i++;
5614 j = i;
5615 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5616 i++;
5617 if (j < i) {
5618 if (maxcount-- <= 0)
5619 break;
5620 SPLIT_APPEND(self->str, j, i);
5621 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5622 i++;
5623 j = i;
5624 }
5625 }
5626 if (j < len) {
5627 SPLIT_APPEND(self->str, j, len);
5628 }
5629 return list;
5630
5631 onError:
5632 Py_DECREF(list);
5633 return NULL;
5634}
5635
5636PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005637 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005639 register Py_ssize_t i;
5640 register Py_ssize_t j;
5641 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 PyObject *list;
5643 PyObject *str;
5644 Py_UNICODE *data;
5645
5646 string = PyUnicode_FromObject(string);
5647 if (string == NULL)
5648 return NULL;
5649 data = PyUnicode_AS_UNICODE(string);
5650 len = PyUnicode_GET_SIZE(string);
5651
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 list = PyList_New(0);
5653 if (!list)
5654 goto onError;
5655
5656 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005657 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005658
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005660 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662
5663 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005664 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 if (i < len) {
5666 if (data[i] == '\r' && i + 1 < len &&
5667 data[i+1] == '\n')
5668 i += 2;
5669 else
5670 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005671 if (keepends)
5672 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 }
Guido van Rossum86662912000-04-11 15:38:46 +00005674 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 j = i;
5676 }
5677 if (j < len) {
5678 SPLIT_APPEND(data, j, len);
5679 }
5680
5681 Py_DECREF(string);
5682 return list;
5683
5684 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005685 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 Py_DECREF(string);
5687 return NULL;
5688}
5689
Tim Petersced69f82003-09-16 20:30:58 +00005690static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691PyObject *split_char(PyUnicodeObject *self,
5692 PyObject *list,
5693 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005694 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005696 register Py_ssize_t i;
5697 register Py_ssize_t j;
5698 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 PyObject *str;
5700
5701 for (i = j = 0; i < len; ) {
5702 if (self->str[i] == ch) {
5703 if (maxcount-- <= 0)
5704 break;
5705 SPLIT_APPEND(self->str, j, i);
5706 i = j = i + 1;
5707 } else
5708 i++;
5709 }
5710 if (j <= len) {
5711 SPLIT_APPEND(self->str, j, len);
5712 }
5713 return list;
5714
5715 onError:
5716 Py_DECREF(list);
5717 return NULL;
5718}
5719
Tim Petersced69f82003-09-16 20:30:58 +00005720static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721PyObject *split_substring(PyUnicodeObject *self,
5722 PyObject *list,
5723 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005724 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005726 register Py_ssize_t i;
5727 register Py_ssize_t j;
5728 Py_ssize_t len = self->length;
5729 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 PyObject *str;
5731
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005732 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 if (Py_UNICODE_MATCH(self, i, substring)) {
5734 if (maxcount-- <= 0)
5735 break;
5736 SPLIT_APPEND(self->str, j, i);
5737 i = j = i + sublen;
5738 } else
5739 i++;
5740 }
5741 if (j <= len) {
5742 SPLIT_APPEND(self->str, j, len);
5743 }
5744 return list;
5745
5746 onError:
5747 Py_DECREF(list);
5748 return NULL;
5749}
5750
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005751static
5752PyObject *rsplit_whitespace(PyUnicodeObject *self,
5753 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005755{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005756 register Py_ssize_t i;
5757 register Py_ssize_t j;
5758 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005759 PyObject *str;
5760
5761 for (i = j = len - 1; i >= 0; ) {
5762 /* find a token */
5763 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5764 i--;
5765 j = i;
5766 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5767 i--;
5768 if (j > i) {
5769 if (maxcount-- <= 0)
5770 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005771 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005772 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5773 i--;
5774 j = i;
5775 }
5776 }
5777 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005778 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005779 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005780 if (PyList_Reverse(list) < 0)
5781 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782 return list;
5783
5784 onError:
5785 Py_DECREF(list);
5786 return NULL;
5787}
5788
5789static
5790PyObject *rsplit_char(PyUnicodeObject *self,
5791 PyObject *list,
5792 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005793 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005795 register Py_ssize_t i;
5796 register Py_ssize_t j;
5797 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798 PyObject *str;
5799
5800 for (i = j = len - 1; i >= 0; ) {
5801 if (self->str[i] == ch) {
5802 if (maxcount-- <= 0)
5803 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005804 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005805 j = i = i - 1;
5806 } else
5807 i--;
5808 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005809 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005810 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005811 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005812 if (PyList_Reverse(list) < 0)
5813 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005814 return list;
5815
5816 onError:
5817 Py_DECREF(list);
5818 return NULL;
5819}
5820
5821static
5822PyObject *rsplit_substring(PyUnicodeObject *self,
5823 PyObject *list,
5824 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005825 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005826{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005827 register Py_ssize_t i;
5828 register Py_ssize_t j;
5829 Py_ssize_t len = self->length;
5830 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005831 PyObject *str;
5832
5833 for (i = len - sublen, j = len; i >= 0; ) {
5834 if (Py_UNICODE_MATCH(self, i, substring)) {
5835 if (maxcount-- <= 0)
5836 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005837 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838 j = i;
5839 i -= sublen;
5840 } else
5841 i--;
5842 }
5843 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005844 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005845 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005846 if (PyList_Reverse(list) < 0)
5847 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005848 return list;
5849
5850 onError:
5851 Py_DECREF(list);
5852 return NULL;
5853}
5854
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855#undef SPLIT_APPEND
5856
5857static
5858PyObject *split(PyUnicodeObject *self,
5859 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005860 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861{
5862 PyObject *list;
5863
5864 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005865 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866
5867 list = PyList_New(0);
5868 if (!list)
5869 return NULL;
5870
5871 if (substring == NULL)
5872 return split_whitespace(self,list,maxcount);
5873
5874 else if (substring->length == 1)
5875 return split_char(self,list,substring->str[0],maxcount);
5876
5877 else if (substring->length == 0) {
5878 Py_DECREF(list);
5879 PyErr_SetString(PyExc_ValueError, "empty separator");
5880 return NULL;
5881 }
5882 else
5883 return split_substring(self,list,substring,maxcount);
5884}
5885
Tim Petersced69f82003-09-16 20:30:58 +00005886static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005887PyObject *rsplit(PyUnicodeObject *self,
5888 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005889 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005890{
5891 PyObject *list;
5892
5893 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005894 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005895
5896 list = PyList_New(0);
5897 if (!list)
5898 return NULL;
5899
5900 if (substring == NULL)
5901 return rsplit_whitespace(self,list,maxcount);
5902
5903 else if (substring->length == 1)
5904 return rsplit_char(self,list,substring->str[0],maxcount);
5905
5906 else if (substring->length == 0) {
5907 Py_DECREF(list);
5908 PyErr_SetString(PyExc_ValueError, "empty separator");
5909 return NULL;
5910 }
5911 else
5912 return rsplit_substring(self,list,substring,maxcount);
5913}
5914
5915static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916PyObject *replace(PyUnicodeObject *self,
5917 PyUnicodeObject *str1,
5918 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005919 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
5921 PyUnicodeObject *u;
5922
5923 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005924 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
Thomas Wouters477c8d52006-05-27 19:21:47 +00005926 if (str1->length == str2->length) {
5927 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005928 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005929 if (str1->length == 1) {
5930 /* replace characters */
5931 Py_UNICODE u1, u2;
5932 if (!findchar(self->str, self->length, str1->str[0]))
5933 goto nothing;
5934 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5935 if (!u)
5936 return NULL;
5937 Py_UNICODE_COPY(u->str, self->str, self->length);
5938 u1 = str1->str[0];
5939 u2 = str2->str[0];
5940 for (i = 0; i < u->length; i++)
5941 if (u->str[i] == u1) {
5942 if (--maxcount < 0)
5943 break;
5944 u->str[i] = u2;
5945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005947 i = fastsearch(
5948 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005950 if (i < 0)
5951 goto nothing;
5952 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5953 if (!u)
5954 return NULL;
5955 Py_UNICODE_COPY(u->str, self->str, self->length);
5956 while (i <= self->length - str1->length)
5957 if (Py_UNICODE_MATCH(self, i, str1)) {
5958 if (--maxcount < 0)
5959 break;
5960 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5961 i += str1->length;
5962 } else
5963 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005966
5967 Py_ssize_t n, i, j, e;
5968 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 Py_UNICODE *p;
5970
5971 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005972 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 if (n > maxcount)
5974 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005975 if (n == 0)
5976 goto nothing;
5977 /* new_size = self->length + n * (str2->length - str1->length)); */
5978 delta = (str2->length - str1->length);
5979 if (delta == 0) {
5980 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005982 product = n * (str2->length - str1->length);
5983 if ((product / (str2->length - str1->length)) != n) {
5984 PyErr_SetString(PyExc_OverflowError,
5985 "replace string is too long");
5986 return NULL;
5987 }
5988 new_size = self->length + product;
5989 if (new_size < 0) {
5990 PyErr_SetString(PyExc_OverflowError,
5991 "replace string is too long");
5992 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 }
5994 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005995 u = _PyUnicode_New(new_size);
5996 if (!u)
5997 return NULL;
5998 i = 0;
5999 p = u->str;
6000 e = self->length - str1->length;
6001 if (str1->length > 0) {
6002 while (n-- > 0) {
6003 /* look for next match */
6004 j = i;
6005 while (j <= e) {
6006 if (Py_UNICODE_MATCH(self, j, str1))
6007 break;
6008 j++;
6009 }
6010 if (j > i) {
6011 if (j > e)
6012 break;
6013 /* copy unchanged part [i:j] */
6014 Py_UNICODE_COPY(p, self->str+i, j-i);
6015 p += j - i;
6016 }
6017 /* copy substitution string */
6018 if (str2->length > 0) {
6019 Py_UNICODE_COPY(p, str2->str, str2->length);
6020 p += str2->length;
6021 }
6022 i = j + str1->length;
6023 }
6024 if (i < self->length)
6025 /* copy tail [i:] */
6026 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6027 } else {
6028 /* interleave */
6029 while (n > 0) {
6030 Py_UNICODE_COPY(p, str2->str, str2->length);
6031 p += str2->length;
6032 if (--n <= 0)
6033 break;
6034 *p++ = self->str[i++];
6035 }
6036 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006040
6041nothing:
6042 /* nothing to replace; return original string (when possible) */
6043 if (PyUnicode_CheckExact(self)) {
6044 Py_INCREF(self);
6045 return (PyObject *) self;
6046 }
6047 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048}
6049
6050/* --- Unicode Object Methods --------------------------------------------- */
6051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006052PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053"S.title() -> unicode\n\
6054\n\
6055Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006056characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057
6058static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006059unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 return fixup(self, fixtitle);
6062}
6063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006064PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065"S.capitalize() -> unicode\n\
6066\n\
6067Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069
6070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006071unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 return fixup(self, fixcapitalize);
6074}
6075
6076#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006077PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078"S.capwords() -> unicode\n\
6079\n\
6080Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006081normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082
6083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006084unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085{
6086 PyObject *list;
6087 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006088 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 /* Split into words */
6091 list = split(self, NULL, -1);
6092 if (!list)
6093 return NULL;
6094
6095 /* Capitalize each word */
6096 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6097 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6098 fixcapitalize);
6099 if (item == NULL)
6100 goto onError;
6101 Py_DECREF(PyList_GET_ITEM(list, i));
6102 PyList_SET_ITEM(list, i, item);
6103 }
6104
6105 /* Join the words to form a new string */
6106 item = PyUnicode_Join(NULL, list);
6107
6108onError:
6109 Py_DECREF(list);
6110 return (PyObject *)item;
6111}
6112#endif
6113
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006114/* Argument converter. Coerces to a single unicode character */
6115
6116static int
6117convert_uc(PyObject *obj, void *addr)
6118{
6119 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6120 PyObject *uniobj;
6121 Py_UNICODE *unistr;
6122
6123 uniobj = PyUnicode_FromObject(obj);
6124 if (uniobj == NULL) {
6125 PyErr_SetString(PyExc_TypeError,
6126 "The fill character cannot be converted to Unicode");
6127 return 0;
6128 }
6129 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6130 PyErr_SetString(PyExc_TypeError,
6131 "The fill character must be exactly one character long");
6132 Py_DECREF(uniobj);
6133 return 0;
6134 }
6135 unistr = PyUnicode_AS_UNICODE(uniobj);
6136 *fillcharloc = unistr[0];
6137 Py_DECREF(uniobj);
6138 return 1;
6139}
6140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006141PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006142"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006144Return S centered in a Unicode string of length width. Padding is\n\
6145done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
6147static PyObject *
6148unicode_center(PyUnicodeObject *self, PyObject *args)
6149{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006150 Py_ssize_t marg, left;
6151 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006152 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153
Thomas Woutersde017742006-02-16 19:34:37 +00006154 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 return NULL;
6156
Tim Peters7a29bd52001-09-12 03:03:31 +00006157 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 Py_INCREF(self);
6159 return (PyObject*) self;
6160 }
6161
6162 marg = width - self->length;
6163 left = marg / 2 + (marg & width & 1);
6164
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006165 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166}
6167
Marc-André Lemburge5034372000-08-08 08:04:29 +00006168#if 0
6169
6170/* This code should go into some future Unicode collation support
6171 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006172 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006173
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006174/* speedy UTF-16 code point order comparison */
6175/* gleaned from: */
6176/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6177
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006178static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006179{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006180 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006181 0, 0, 0, 0, 0, 0, 0, 0,
6182 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006183 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006184};
6185
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186static int
6187unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6188{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006189 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006190
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 Py_UNICODE *s1 = str1->str;
6192 Py_UNICODE *s2 = str2->str;
6193
6194 len1 = str1->length;
6195 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006196
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006198 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006199
6200 c1 = *s1++;
6201 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006202
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006203 if (c1 > (1<<11) * 26)
6204 c1 += utf16Fixup[c1>>11];
6205 if (c2 > (1<<11) * 26)
6206 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006207 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006208
6209 if (c1 != c2)
6210 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006211
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006212 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 }
6214
6215 return (len1 < len2) ? -1 : (len1 != len2);
6216}
6217
Marc-André Lemburge5034372000-08-08 08:04:29 +00006218#else
6219
6220static int
6221unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6222{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006224
6225 Py_UNICODE *s1 = str1->str;
6226 Py_UNICODE *s2 = str2->str;
6227
6228 len1 = str1->length;
6229 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006230
Marc-André Lemburge5034372000-08-08 08:04:29 +00006231 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006232 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006233
Fredrik Lundh45714e92001-06-26 16:39:36 +00006234 c1 = *s1++;
6235 c2 = *s2++;
6236
6237 if (c1 != c2)
6238 return (c1 < c2) ? -1 : 1;
6239
Marc-André Lemburge5034372000-08-08 08:04:29 +00006240 len1--; len2--;
6241 }
6242
6243 return (len1 < len2) ? -1 : (len1 != len2);
6244}
6245
6246#endif
6247
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248int PyUnicode_Compare(PyObject *left,
6249 PyObject *right)
6250{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006251 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6252 return unicode_compare((PyUnicodeObject *)left,
6253 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006254 PyErr_Format(PyExc_TypeError,
6255 "Can't compare %.100s and %.100s",
6256 left->ob_type->tp_name,
6257 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 return -1;
6259}
6260
Martin v. Löwis5b222132007-06-10 09:51:05 +00006261int
6262PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6263{
6264 int i;
6265 Py_UNICODE *id;
6266 assert(PyUnicode_Check(uni));
6267 id = PyUnicode_AS_UNICODE(uni);
6268 /* Compare Unicode string and source character set string */
6269 for (i = 0; id[i] && str[i]; i++)
6270 if (id[i] != str[i])
6271 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6272 if (id[i])
6273 return 1; /* uni is longer */
6274 if (str[i])
6275 return -1; /* str is longer */
6276 return 0;
6277}
6278
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006279PyObject *PyUnicode_RichCompare(PyObject *left,
6280 PyObject *right,
6281 int op)
6282{
6283 int result;
6284
6285 result = PyUnicode_Compare(left, right);
6286 if (result == -1 && PyErr_Occurred())
6287 goto onError;
6288
6289 /* Convert the return value to a Boolean */
6290 switch (op) {
6291 case Py_EQ:
6292 result = (result == 0);
6293 break;
6294 case Py_NE:
6295 result = (result != 0);
6296 break;
6297 case Py_LE:
6298 result = (result <= 0);
6299 break;
6300 case Py_GE:
6301 result = (result >= 0);
6302 break;
6303 case Py_LT:
6304 result = (result == -1);
6305 break;
6306 case Py_GT:
6307 result = (result == 1);
6308 break;
6309 }
6310 return PyBool_FromLong(result);
6311
6312 onError:
6313
6314 /* Standard case
6315
6316 Type errors mean that PyUnicode_FromObject() could not convert
6317 one of the arguments (usually the right hand side) to Unicode,
6318 ie. we can't handle the comparison request. However, it is
6319 possible that the other object knows a comparison method, which
6320 is why we return Py_NotImplemented to give the other object a
6321 chance.
6322
6323 */
6324 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6325 PyErr_Clear();
6326 Py_INCREF(Py_NotImplemented);
6327 return Py_NotImplemented;
6328 }
6329 if (op != Py_EQ && op != Py_NE)
6330 return NULL;
6331
6332 /* Equality comparison.
6333
6334 This is a special case: we silence any PyExc_UnicodeDecodeError
6335 and instead turn it into a PyErr_UnicodeWarning.
6336
6337 */
6338 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6339 return NULL;
6340 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006341 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6342 (op == Py_EQ) ?
6343 "Unicode equal comparison "
6344 "failed to convert both arguments to Unicode - "
6345 "interpreting them as being unequal"
6346 :
6347 "Unicode unequal comparison "
6348 "failed to convert both arguments to Unicode - "
6349 "interpreting them as being unequal",
6350 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006351 return NULL;
6352 result = (op == Py_NE);
6353 return PyBool_FromLong(result);
6354}
6355
Guido van Rossum403d68b2000-03-13 15:55:09 +00006356int PyUnicode_Contains(PyObject *container,
6357 PyObject *element)
6358{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006359 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006360 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006361
6362 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006363 sub = PyUnicode_FromObject(element);
6364 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006365 PyErr_Format(PyExc_TypeError,
6366 "'in <string>' requires string as left operand, not %s",
6367 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006368 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006369 }
6370
Thomas Wouters477c8d52006-05-27 19:21:47 +00006371 str = PyUnicode_FromObject(container);
6372 if (!str) {
6373 Py_DECREF(sub);
6374 return -1;
6375 }
6376
6377 result = stringlib_contains_obj(str, sub);
6378
6379 Py_DECREF(str);
6380 Py_DECREF(sub);
6381
Guido van Rossum403d68b2000-03-13 15:55:09 +00006382 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006383}
6384
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385/* Concat to string or Unicode object giving a new Unicode object. */
6386
6387PyObject *PyUnicode_Concat(PyObject *left,
6388 PyObject *right)
6389{
6390 PyUnicodeObject *u = NULL, *v = NULL, *w;
6391
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006392 if (PyBytes_Check(left) || PyBytes_Check(right))
6393 return PyBytes_Concat(left, right);
6394
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 /* Coerce the two arguments */
6396 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6397 if (u == NULL)
6398 goto onError;
6399 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6400 if (v == NULL)
6401 goto onError;
6402
6403 /* Shortcuts */
6404 if (v == unicode_empty) {
6405 Py_DECREF(v);
6406 return (PyObject *)u;
6407 }
6408 if (u == unicode_empty) {
6409 Py_DECREF(u);
6410 return (PyObject *)v;
6411 }
6412
6413 /* Concat the two Unicode strings */
6414 w = _PyUnicode_New(u->length + v->length);
6415 if (w == NULL)
6416 goto onError;
6417 Py_UNICODE_COPY(w->str, u->str, u->length);
6418 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6419
6420 Py_DECREF(u);
6421 Py_DECREF(v);
6422 return (PyObject *)w;
6423
6424onError:
6425 Py_XDECREF(u);
6426 Py_XDECREF(v);
6427 return NULL;
6428}
6429
Walter Dörwald1ab83302007-05-18 17:15:44 +00006430void
6431PyUnicode_Append(PyObject **pleft, PyObject *right)
6432{
6433 PyObject *new;
6434 if (*pleft == NULL)
6435 return;
6436 if (right == NULL || !PyUnicode_Check(*pleft)) {
6437 Py_DECREF(*pleft);
6438 *pleft = NULL;
6439 return;
6440 }
6441 new = PyUnicode_Concat(*pleft, right);
6442 Py_DECREF(*pleft);
6443 *pleft = new;
6444}
6445
6446void
6447PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6448{
6449 PyUnicode_Append(pleft, right);
6450 Py_XDECREF(right);
6451}
6452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006453PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454"S.count(sub[, start[, end]]) -> int\n\
6455\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006456Return the number of non-overlapping occurrences of substring sub in\n\
6457Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006458interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
6460static PyObject *
6461unicode_count(PyUnicodeObject *self, PyObject *args)
6462{
6463 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006464 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006465 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 PyObject *result;
6467
Guido van Rossumb8872e62000-05-09 14:14:27 +00006468 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6469 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 return NULL;
6471
6472 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006473 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 if (substring == NULL)
6475 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006476
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478
Thomas Wouters477c8d52006-05-27 19:21:47 +00006479 result = PyInt_FromSsize_t(
6480 stringlib_count(self->str + start, end - start,
6481 substring->str, substring->length)
6482 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
6484 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006485
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 return result;
6487}
6488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006489PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006490"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006492Encodes S using the codec registered for encoding. encoding defaults\n\
6493to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006494handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006495a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6496'xmlcharrefreplace' as well as any other name registered with\n\
6497codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
6499static PyObject *
6500unicode_encode(PyUnicodeObject *self, PyObject *args)
6501{
6502 char *encoding = NULL;
6503 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006504 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006505
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6507 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006508 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006509 if (v == NULL)
6510 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006511 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006512 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006513 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006514 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006515 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006516 Py_DECREF(v);
6517 return NULL;
6518 }
6519 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006520
6521 onError:
6522 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006523}
6524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006525PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526"S.expandtabs([tabsize]) -> unicode\n\
6527\n\
6528Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006529If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530
6531static PyObject*
6532unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6533{
6534 Py_UNICODE *e;
6535 Py_UNICODE *p;
6536 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006537 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 PyUnicodeObject *u;
6539 int tabsize = 8;
6540
6541 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6542 return NULL;
6543
Thomas Wouters7e474022000-07-16 12:04:32 +00006544 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006545 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 e = self->str + self->length;
6547 for (p = self->str; p < e; p++)
6548 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006549 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006551 if (old_j > j) {
6552 PyErr_SetString(PyExc_OverflowError,
6553 "new string is too long");
6554 return NULL;
6555 }
6556 old_j = j;
6557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 }
6559 else {
6560 j++;
6561 if (*p == '\n' || *p == '\r') {
6562 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006563 old_j = j = 0;
6564 if (i < 0) {
6565 PyErr_SetString(PyExc_OverflowError,
6566 "new string is too long");
6567 return NULL;
6568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 }
6570 }
6571
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006572 if ((i + j) < 0) {
6573 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6574 return NULL;
6575 }
6576
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 /* Second pass: create output string and fill it */
6578 u = _PyUnicode_New(i + j);
6579 if (!u)
6580 return NULL;
6581
6582 j = 0;
6583 q = u->str;
6584
6585 for (p = self->str; p < e; p++)
6586 if (*p == '\t') {
6587 if (tabsize > 0) {
6588 i = tabsize - (j % tabsize);
6589 j += i;
6590 while (i--)
6591 *q++ = ' ';
6592 }
6593 }
6594 else {
6595 j++;
6596 *q++ = *p;
6597 if (*p == '\n' || *p == '\r')
6598 j = 0;
6599 }
6600
6601 return (PyObject*) u;
6602}
6603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006604PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605"S.find(sub [,start [,end]]) -> int\n\
6606\n\
6607Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006608such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609arguments start and end are interpreted as in slice notation.\n\
6610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006611Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
6613static PyObject *
6614unicode_find(PyUnicodeObject *self, PyObject *args)
6615{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006616 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006617 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006618 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006619 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620
Guido van Rossumb8872e62000-05-09 14:14:27 +00006621 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6622 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006624 substring = PyUnicode_FromObject(substring);
6625 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 return NULL;
6627
Thomas Wouters477c8d52006-05-27 19:21:47 +00006628 result = stringlib_find_slice(
6629 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6630 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6631 start, end
6632 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633
6634 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006635
6636 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
6639static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006640unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641{
6642 if (index < 0 || index >= self->length) {
6643 PyErr_SetString(PyExc_IndexError, "string index out of range");
6644 return NULL;
6645 }
6646
6647 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6648}
6649
Guido van Rossumc2504932007-09-18 19:42:40 +00006650/* Believe it or not, this produces the same value for ASCII strings
6651 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006653unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654{
Guido van Rossumc2504932007-09-18 19:42:40 +00006655 Py_ssize_t len;
6656 Py_UNICODE *p;
6657 long x;
6658
6659 if (self->hash != -1)
6660 return self->hash;
6661 len = Py_Size(self);
6662 p = self->str;
6663 x = *p << 7;
6664 while (--len >= 0)
6665 x = (1000003*x) ^ *p++;
6666 x ^= Py_Size(self);
6667 if (x == -1)
6668 x = -2;
6669 self->hash = x;
6670 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671}
6672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006673PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674"S.index(sub [,start [,end]]) -> int\n\
6675\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006676Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
6678static PyObject *
6679unicode_index(PyUnicodeObject *self, PyObject *args)
6680{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006681 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006682 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006683 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006684 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685
Guido van Rossumb8872e62000-05-09 14:14:27 +00006686 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6687 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006689 substring = PyUnicode_FromObject(substring);
6690 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 return NULL;
6692
Thomas Wouters477c8d52006-05-27 19:21:47 +00006693 result = stringlib_find_slice(
6694 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6695 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6696 start, end
6697 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698
6699 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (result < 0) {
6702 PyErr_SetString(PyExc_ValueError, "substring not found");
6703 return NULL;
6704 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006705
Martin v. Löwis18e16552006-02-15 17:27:45 +00006706 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707}
6708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006710"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006713at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
6715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006716unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717{
6718 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719 register const Py_UNICODE *e;
6720 int cased;
6721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 /* Shortcut for single character strings */
6723 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006724 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006726 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006727 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006728 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006729
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 e = p + PyUnicode_GET_SIZE(self);
6731 cased = 0;
6732 for (; p < e; p++) {
6733 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006736 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 else if (!cased && Py_UNICODE_ISLOWER(ch))
6738 cased = 1;
6739 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006740 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006743PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006744"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006746Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006747at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
6749static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006750unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751{
6752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753 register const Py_UNICODE *e;
6754 int cased;
6755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 /* Shortcut for single character strings */
6757 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006758 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006760 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006761 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006762 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006763
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 e = p + PyUnicode_GET_SIZE(self);
6765 cased = 0;
6766 for (; p < e; p++) {
6767 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006770 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 else if (!cased && Py_UNICODE_ISUPPER(ch))
6772 cased = 1;
6773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006774 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006778"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006780Return True if S is a titlecased string and there is at least one\n\
6781character in S, i.e. upper- and titlecase characters may only\n\
6782follow uncased characters and lowercase characters only cased ones.\n\
6783Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784
6785static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006786unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787{
6788 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6789 register const Py_UNICODE *e;
6790 int cased, previous_is_cased;
6791
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 /* Shortcut for single character strings */
6793 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006794 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6795 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006797 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006798 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006799 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006800
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 e = p + PyUnicode_GET_SIZE(self);
6802 cased = 0;
6803 previous_is_cased = 0;
6804 for (; p < e; p++) {
6805 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006806
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6808 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006809 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 previous_is_cased = 1;
6811 cased = 1;
6812 }
6813 else if (Py_UNICODE_ISLOWER(ch)) {
6814 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006815 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 previous_is_cased = 1;
6817 cased = 1;
6818 }
6819 else
6820 previous_is_cased = 0;
6821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006828Return True if all characters in S are whitespace\n\
6829and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
6831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006832unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833{
6834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835 register const Py_UNICODE *e;
6836
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 /* Shortcut for single character strings */
6838 if (PyUnicode_GET_SIZE(self) == 1 &&
6839 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006842 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006843 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006844 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006845
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 e = p + PyUnicode_GET_SIZE(self);
6847 for (; p < e; p++) {
6848 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006849 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006851 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852}
6853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006854PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006857Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859
6860static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006861unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006862{
6863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864 register const Py_UNICODE *e;
6865
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006866 /* Shortcut for single character strings */
6867 if (PyUnicode_GET_SIZE(self) == 1 &&
6868 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006870
6871 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006872 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006873 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006874
6875 e = p + PyUnicode_GET_SIZE(self);
6876 for (; p < e; p++) {
6877 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006879 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006881}
6882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006883PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006885\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006886Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888
6889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006890unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006891{
6892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893 register const Py_UNICODE *e;
6894
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895 /* Shortcut for single character strings */
6896 if (PyUnicode_GET_SIZE(self) == 1 &&
6897 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006899
6900 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006901 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006902 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006903
6904 e = p + PyUnicode_GET_SIZE(self);
6905 for (; p < e; p++) {
6906 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006910}
6911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006912PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006913"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006915Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006916False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917
6918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006919unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920{
6921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6922 register const Py_UNICODE *e;
6923
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 /* Shortcut for single character strings */
6925 if (PyUnicode_GET_SIZE(self) == 1 &&
6926 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006929 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006930 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006932
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 e = p + PyUnicode_GET_SIZE(self);
6934 for (; p < e; p++) {
6935 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006941PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006942"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006944Return True if all characters in S are digits\n\
6945and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
6947static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006948unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949{
6950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6951 register const Py_UNICODE *e;
6952
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 /* Shortcut for single character strings */
6954 if (PyUnicode_GET_SIZE(self) == 1 &&
6955 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006958 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006959 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006960 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006961
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 e = p + PyUnicode_GET_SIZE(self);
6963 for (; p < e; p++) {
6964 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006965 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006967 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968}
6969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006970PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006971"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006973Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006974False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975
6976static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006977unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978{
6979 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6980 register const Py_UNICODE *e;
6981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 /* Shortcut for single character strings */
6983 if (PyUnicode_GET_SIZE(self) == 1 &&
6984 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006985 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006987 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006988 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006989 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006990
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 e = p + PyUnicode_GET_SIZE(self);
6992 for (; p < e; p++) {
6993 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006994 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006996 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997}
6998
Martin v. Löwis47383402007-08-15 07:32:56 +00006999int
7000PyUnicode_IsIdentifier(PyObject *self)
7001{
7002 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7003 register const Py_UNICODE *e;
7004
7005 /* Special case for empty strings */
7006 if (PyUnicode_GET_SIZE(self) == 0)
7007 return 0;
7008
7009 /* PEP 3131 says that the first character must be in
7010 XID_Start and subsequent characters in XID_Continue,
7011 and for the ASCII range, the 2.x rules apply (i.e
7012 start with letters and underscore, continue with
7013 letters, digits, underscore). However, given the current
7014 definition of XID_Start and XID_Continue, it is sufficient
7015 to check just for these, except that _ must be allowed
7016 as starting an identifier. */
7017 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7018 return 0;
7019
7020 e = p + PyUnicode_GET_SIZE(self);
7021 for (p++; p < e; p++) {
7022 if (!_PyUnicode_IsXidContinue(*p))
7023 return 0;
7024 }
7025 return 1;
7026}
7027
7028PyDoc_STRVAR(isidentifier__doc__,
7029"S.isidentifier() -> bool\n\
7030\n\
7031Return True if S is a valid identifier according\n\
7032to the language definition.");
7033
7034static PyObject*
7035unicode_isidentifier(PyObject *self)
7036{
7037 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7038}
7039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007040PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041"S.join(sequence) -> unicode\n\
7042\n\
7043Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007044sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045
7046static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007047unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007049 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050}
7051
Martin v. Löwis18e16552006-02-15 17:27:45 +00007052static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053unicode_length(PyUnicodeObject *self)
7054{
7055 return self->length;
7056}
7057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007059"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060\n\
7061Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007062done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
7064static PyObject *
7065unicode_ljust(PyUnicodeObject *self, PyObject *args)
7066{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007067 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007068 Py_UNICODE fillchar = ' ';
7069
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007070 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 return NULL;
7072
Tim Peters7a29bd52001-09-12 03:03:31 +00007073 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 Py_INCREF(self);
7075 return (PyObject*) self;
7076 }
7077
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007078 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079}
7080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007081PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082"S.lower() -> unicode\n\
7083\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007084Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085
7086static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007087unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 return fixup(self, fixlower);
7090}
7091
/* Strip modes.  The values double as indexes into stripformat[], so
   they must stay in step with the order of its entries. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is a PyArg_ParseTuple format: one optional object argument
   followed by the method name used in error messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for strip mode i: skips the three-character "|O:"
   prefix of the format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7100
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007101/* externally visible for str.strip(unicode) */
7102PyObject *
7103_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7104{
7105 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007106 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007107 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007108 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7109 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110
Thomas Wouters477c8d52006-05-27 19:21:47 +00007111 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7112
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007113 i = 0;
7114 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007115 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7116 i++;
7117 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118 }
7119
7120 j = len;
7121 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007122 do {
7123 j--;
7124 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7125 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007126 }
7127
7128 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007129 Py_INCREF(self);
7130 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007131 }
7132 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007133 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007134}
7135
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136
7137static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007138do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007140 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007141 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007142
7143 i = 0;
7144 if (striptype != RIGHTSTRIP) {
7145 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7146 i++;
7147 }
7148 }
7149
7150 j = len;
7151 if (striptype != LEFTSTRIP) {
7152 do {
7153 j--;
7154 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7155 j++;
7156 }
7157
7158 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7159 Py_INCREF(self);
7160 return (PyObject*)self;
7161 }
7162 else
7163 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164}
7165
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007166
7167static PyObject *
7168do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7169{
7170 PyObject *sep = NULL;
7171
7172 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7173 return NULL;
7174
7175 if (sep != NULL && sep != Py_None) {
7176 if (PyUnicode_Check(sep))
7177 return _PyUnicode_XStrip(self, striptype, sep);
7178 else if (PyString_Check(sep)) {
7179 PyObject *res;
7180 sep = PyUnicode_FromObject(sep);
7181 if (sep==NULL)
7182 return NULL;
7183 res = _PyUnicode_XStrip(self, striptype, sep);
7184 Py_DECREF(sep);
7185 return res;
7186 }
7187 else {
7188 PyErr_Format(PyExc_TypeError,
7189 "%s arg must be None, unicode or str",
7190 STRIPNAME(striptype));
7191 return NULL;
7192 }
7193 }
7194
7195 return do_strip(self, striptype);
7196}
7197
7198
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007199PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007200"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007201\n\
7202Return a copy of the string S with leading and trailing\n\
7203whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007204If chars is given and not None, remove characters in chars instead.\n\
7205If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007206
7207static PyObject *
7208unicode_strip(PyUnicodeObject *self, PyObject *args)
7209{
7210 if (PyTuple_GET_SIZE(args) == 0)
7211 return do_strip(self, BOTHSTRIP); /* Common case */
7212 else
7213 return do_argstrip(self, BOTHSTRIP, args);
7214}
7215
7216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007217PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007218"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007219\n\
7220Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007221If chars is given and not None, remove characters in chars instead.\n\
7222If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007223
7224static PyObject *
7225unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7226{
7227 if (PyTuple_GET_SIZE(args) == 0)
7228 return do_strip(self, LEFTSTRIP); /* Common case */
7229 else
7230 return do_argstrip(self, LEFTSTRIP, args);
7231}
7232
7233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007234PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007235"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007236\n\
7237Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007238If chars is given and not None, remove characters in chars instead.\n\
7239If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007240
7241static PyObject *
7242unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7243{
7244 if (PyTuple_GET_SIZE(args) == 0)
7245 return do_strip(self, RIGHTSTRIP); /* Common case */
7246 else
7247 return do_argstrip(self, RIGHTSTRIP, args);
7248}
7249
7250
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253{
7254 PyUnicodeObject *u;
7255 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007256 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007257 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258
7259 if (len < 0)
7260 len = 0;
7261
Tim Peters7a29bd52001-09-12 03:03:31 +00007262 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 /* no repeat, return original string */
7264 Py_INCREF(str);
7265 return (PyObject*) str;
7266 }
Tim Peters8f422462000-09-09 06:13:41 +00007267
7268 /* ensure # of chars needed doesn't overflow int and # of bytes
7269 * needed doesn't overflow size_t
7270 */
7271 nchars = len * str->length;
7272 if (len && nchars / len != str->length) {
7273 PyErr_SetString(PyExc_OverflowError,
7274 "repeated string is too long");
7275 return NULL;
7276 }
7277 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7278 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7279 PyErr_SetString(PyExc_OverflowError,
7280 "repeated string is too long");
7281 return NULL;
7282 }
7283 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 if (!u)
7285 return NULL;
7286
7287 p = u->str;
7288
Thomas Wouters477c8d52006-05-27 19:21:47 +00007289 if (str->length == 1 && len > 0) {
7290 Py_UNICODE_FILL(p, str->str[0], len);
7291 } else {
7292 Py_ssize_t done = 0; /* number of characters copied this far */
7293 if (done < nchars) {
7294 Py_UNICODE_COPY(p, str->str, str->length);
7295 done = str->length;
7296 }
7297 while (done < nchars) {
7298 int n = (done <= nchars-done) ? done : nchars-done;
7299 Py_UNICODE_COPY(p+done, p, n);
7300 done += n;
7301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 }
7303
7304 return (PyObject*) u;
7305}
7306
7307PyObject *PyUnicode_Replace(PyObject *obj,
7308 PyObject *subobj,
7309 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007310 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311{
7312 PyObject *self;
7313 PyObject *str1;
7314 PyObject *str2;
7315 PyObject *result;
7316
7317 self = PyUnicode_FromObject(obj);
7318 if (self == NULL)
7319 return NULL;
7320 str1 = PyUnicode_FromObject(subobj);
7321 if (str1 == NULL) {
7322 Py_DECREF(self);
7323 return NULL;
7324 }
7325 str2 = PyUnicode_FromObject(replobj);
7326 if (str2 == NULL) {
7327 Py_DECREF(self);
7328 Py_DECREF(str1);
7329 return NULL;
7330 }
Tim Petersced69f82003-09-16 20:30:58 +00007331 result = replace((PyUnicodeObject *)self,
7332 (PyUnicodeObject *)str1,
7333 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 maxcount);
7335 Py_DECREF(self);
7336 Py_DECREF(str1);
7337 Py_DECREF(str2);
7338 return result;
7339}
7340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007341PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342"S.replace (old, new[, maxsplit]) -> unicode\n\
7343\n\
7344Return a copy of S with all occurrences of substring\n\
7345old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007346given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347
7348static PyObject*
7349unicode_replace(PyUnicodeObject *self, PyObject *args)
7350{
7351 PyUnicodeObject *str1;
7352 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007353 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 PyObject *result;
7355
Martin v. Löwis18e16552006-02-15 17:27:45 +00007356 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 return NULL;
7358 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7359 if (str1 == NULL)
7360 return NULL;
7361 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007362 if (str2 == NULL) {
7363 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366
7367 result = replace(self, str1, str2, maxcount);
7368
7369 Py_DECREF(str1);
7370 Py_DECREF(str2);
7371 return result;
7372}
7373
7374static
7375PyObject *unicode_repr(PyObject *unicode)
7376{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007377 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007378 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007379 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7380 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7381
7382 /* XXX(nnorwitz): rather than over-allocating, it would be
7383 better to choose a different scheme. Perhaps scan the
7384 first N-chars of the string and allocate based on that size.
7385 */
7386 /* Initial allocation is based on the longest-possible unichr
7387 escape.
7388
7389 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7390 unichr, so in this case it's the longest unichr escape. In
7391 narrow (UTF-16) builds this is five chars per source unichr
7392 since there are two unichrs in the surrogate pair, so in narrow
7393 (UTF-16) builds it's not the longest unichr escape.
7394
7395 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7396 so in the narrow (UTF-16) build case it's the longest unichr
7397 escape.
7398 */
7399
Walter Dörwald1ab83302007-05-18 17:15:44 +00007400 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007401 2 /* quotes */
7402#ifdef Py_UNICODE_WIDE
7403 + 10*size
7404#else
7405 + 6*size
7406#endif
7407 + 1);
7408 if (repr == NULL)
7409 return NULL;
7410
Walter Dörwald1ab83302007-05-18 17:15:44 +00007411 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007412
7413 /* Add quote */
7414 *p++ = (findchar(s, size, '\'') &&
7415 !findchar(s, size, '"')) ? '"' : '\'';
7416 while (size-- > 0) {
7417 Py_UNICODE ch = *s++;
7418
7419 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007420 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007421 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007422 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007423 continue;
7424 }
7425
7426#ifdef Py_UNICODE_WIDE
7427 /* Map 21-bit characters to '\U00xxxxxx' */
7428 else if (ch >= 0x10000) {
7429 *p++ = '\\';
7430 *p++ = 'U';
7431 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7432 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7433 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7434 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7435 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7436 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7437 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7438 *p++ = hexdigits[ch & 0x0000000F];
7439 continue;
7440 }
7441#else
7442 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7443 else if (ch >= 0xD800 && ch < 0xDC00) {
7444 Py_UNICODE ch2;
7445 Py_UCS4 ucs;
7446
7447 ch2 = *s++;
7448 size--;
7449 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7450 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7451 *p++ = '\\';
7452 *p++ = 'U';
7453 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7454 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7455 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7456 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7457 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7458 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7459 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7460 *p++ = hexdigits[ucs & 0x0000000F];
7461 continue;
7462 }
7463 /* Fall through: isolated surrogates are copied as-is */
7464 s--;
7465 size++;
7466 }
7467#endif
7468
7469 /* Map 16-bit characters to '\uxxxx' */
7470 if (ch >= 256) {
7471 *p++ = '\\';
7472 *p++ = 'u';
7473 *p++ = hexdigits[(ch >> 12) & 0x000F];
7474 *p++ = hexdigits[(ch >> 8) & 0x000F];
7475 *p++ = hexdigits[(ch >> 4) & 0x000F];
7476 *p++ = hexdigits[ch & 0x000F];
7477 }
7478
7479 /* Map special whitespace to '\t', \n', '\r' */
7480 else if (ch == '\t') {
7481 *p++ = '\\';
7482 *p++ = 't';
7483 }
7484 else if (ch == '\n') {
7485 *p++ = '\\';
7486 *p++ = 'n';
7487 }
7488 else if (ch == '\r') {
7489 *p++ = '\\';
7490 *p++ = 'r';
7491 }
7492
7493 /* Map non-printable US ASCII to '\xhh' */
7494 else if (ch < ' ' || ch >= 0x7F) {
7495 *p++ = '\\';
7496 *p++ = 'x';
7497 *p++ = hexdigits[(ch >> 4) & 0x000F];
7498 *p++ = hexdigits[ch & 0x000F];
7499 }
7500
7501 /* Copy everything else as-is */
7502 else
7503 *p++ = (char) ch;
7504 }
7505 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007506 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007507
7508 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007509 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007510 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511}
7512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007513PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514"S.rfind(sub [,start [,end]]) -> int\n\
7515\n\
7516Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007517such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518arguments start and end are interpreted as in slice notation.\n\
7519\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007520Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521
7522static PyObject *
7523unicode_rfind(PyUnicodeObject *self, PyObject *args)
7524{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007525 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007526 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007527 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007528 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529
Guido van Rossumb8872e62000-05-09 14:14:27 +00007530 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7531 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007533 substring = PyUnicode_FromObject(substring);
7534 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535 return NULL;
7536
Thomas Wouters477c8d52006-05-27 19:21:47 +00007537 result = stringlib_rfind_slice(
7538 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7539 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7540 start, end
7541 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542
7543 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007544
7545 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546}
7547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007548PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549"S.rindex(sub [,start [,end]]) -> int\n\
7550\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007551Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552
7553static PyObject *
7554unicode_rindex(PyUnicodeObject *self, PyObject *args)
7555{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007556 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007557 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007558 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007559 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
Guido van Rossumb8872e62000-05-09 14:14:27 +00007561 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7562 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007564 substring = PyUnicode_FromObject(substring);
7565 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 return NULL;
7567
Thomas Wouters477c8d52006-05-27 19:21:47 +00007568 result = stringlib_rfind_slice(
7569 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7570 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7571 start, end
7572 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573
7574 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007575
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 if (result < 0) {
7577 PyErr_SetString(PyExc_ValueError, "substring not found");
7578 return NULL;
7579 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581}
7582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007584"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585\n\
7586Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007587done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
7589static PyObject *
7590unicode_rjust(PyUnicodeObject *self, PyObject *args)
7591{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007592 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007593 Py_UNICODE fillchar = ' ';
7594
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007595 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 return NULL;
7597
Tim Peters7a29bd52001-09-12 03:03:31 +00007598 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 Py_INCREF(self);
7600 return (PyObject*) self;
7601 }
7602
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007603 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604}
7605
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606PyObject *PyUnicode_Split(PyObject *s,
7607 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007608 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609{
7610 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007611
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 s = PyUnicode_FromObject(s);
7613 if (s == NULL)
7614 return NULL;
7615 if (sep != NULL) {
7616 sep = PyUnicode_FromObject(sep);
7617 if (sep == NULL) {
7618 Py_DECREF(s);
7619 return NULL;
7620 }
7621 }
7622
7623 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7624
7625 Py_DECREF(s);
7626 Py_XDECREF(sep);
7627 return result;
7628}
7629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007630PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631"S.split([sep [,maxsplit]]) -> list of strings\n\
7632\n\
7633Return a list of the words in S, using sep as the\n\
7634delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007635splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007636any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637
7638static PyObject*
7639unicode_split(PyUnicodeObject *self, PyObject *args)
7640{
7641 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007642 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643
Martin v. Löwis18e16552006-02-15 17:27:45 +00007644 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 return NULL;
7646
7647 if (substring == Py_None)
7648 return split(self, NULL, maxcount);
7649 else if (PyUnicode_Check(substring))
7650 return split(self, (PyUnicodeObject *)substring, maxcount);
7651 else
7652 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7653}
7654
Thomas Wouters477c8d52006-05-27 19:21:47 +00007655PyObject *
7656PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7657{
7658 PyObject* str_obj;
7659 PyObject* sep_obj;
7660 PyObject* out;
7661
7662 str_obj = PyUnicode_FromObject(str_in);
7663 if (!str_obj)
7664 return NULL;
7665 sep_obj = PyUnicode_FromObject(sep_in);
7666 if (!sep_obj) {
7667 Py_DECREF(str_obj);
7668 return NULL;
7669 }
7670
7671 out = stringlib_partition(
7672 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7673 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7674 );
7675
7676 Py_DECREF(sep_obj);
7677 Py_DECREF(str_obj);
7678
7679 return out;
7680}
7681
7682
7683PyObject *
7684PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7685{
7686 PyObject* str_obj;
7687 PyObject* sep_obj;
7688 PyObject* out;
7689
7690 str_obj = PyUnicode_FromObject(str_in);
7691 if (!str_obj)
7692 return NULL;
7693 sep_obj = PyUnicode_FromObject(sep_in);
7694 if (!sep_obj) {
7695 Py_DECREF(str_obj);
7696 return NULL;
7697 }
7698
7699 out = stringlib_rpartition(
7700 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7701 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7702 );
7703
7704 Py_DECREF(sep_obj);
7705 Py_DECREF(str_obj);
7706
7707 return out;
7708}
7709
7710PyDoc_STRVAR(partition__doc__,
7711"S.partition(sep) -> (head, sep, tail)\n\
7712\n\
7713Searches for the separator sep in S, and returns the part before it,\n\
7714the separator itself, and the part after it. If the separator is not\n\
7715found, returns S and two empty strings.");
7716
7717static PyObject*
7718unicode_partition(PyUnicodeObject *self, PyObject *separator)
7719{
7720 return PyUnicode_Partition((PyObject *)self, separator);
7721}
7722
7723PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007724"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007725\n\
7726Searches for the separator sep in S, starting at the end of S, and returns\n\
7727the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007728separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007729
7730static PyObject*
7731unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7732{
7733 return PyUnicode_RPartition((PyObject *)self, separator);
7734}
7735
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007736PyObject *PyUnicode_RSplit(PyObject *s,
7737 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007738 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007739{
7740 PyObject *result;
7741
7742 s = PyUnicode_FromObject(s);
7743 if (s == NULL)
7744 return NULL;
7745 if (sep != NULL) {
7746 sep = PyUnicode_FromObject(sep);
7747 if (sep == NULL) {
7748 Py_DECREF(s);
7749 return NULL;
7750 }
7751 }
7752
7753 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7754
7755 Py_DECREF(s);
7756 Py_XDECREF(sep);
7757 return result;
7758}
7759
7760PyDoc_STRVAR(rsplit__doc__,
7761"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7762\n\
7763Return a list of the words in S, using sep as the\n\
7764delimiter string, starting at the end of the string and\n\
7765working to the front. If maxsplit is given, at most maxsplit\n\
7766splits are done. If sep is not specified, any whitespace string\n\
7767is a separator.");
7768
7769static PyObject*
7770unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7771{
7772 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007773 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007774
Martin v. Löwis18e16552006-02-15 17:27:45 +00007775 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007776 return NULL;
7777
7778 if (substring == Py_None)
7779 return rsplit(self, NULL, maxcount);
7780 else if (PyUnicode_Check(substring))
7781 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7782 else
7783 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7784}
7785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007786PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007787"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788\n\
7789Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007790Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007791is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
7793static PyObject*
7794unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7795{
Guido van Rossum86662912000-04-11 15:38:46 +00007796 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797
Guido van Rossum86662912000-04-11 15:38:46 +00007798 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 return NULL;
7800
Guido van Rossum86662912000-04-11 15:38:46 +00007801 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802}
7803
7804static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007805PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806{
Walter Dörwald346737f2007-05-31 10:44:43 +00007807 if (PyUnicode_CheckExact(self)) {
7808 Py_INCREF(self);
7809 return self;
7810 } else
7811 /* Subtype -- return genuine unicode string with the same value. */
7812 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7813 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814}
7815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007816PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817"S.swapcase() -> unicode\n\
7818\n\
7819Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007820and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821
7822static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007823unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 return fixup(self, fixswapcase);
7826}
7827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007828PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829"S.translate(table) -> unicode\n\
7830\n\
7831Return a copy of the string S, where all characters have been mapped\n\
7832through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007833Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7834Unmapped characters are left untouched. Characters mapped to None\n\
7835are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836
7837static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007838unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839{
Georg Brandl94c2c752007-10-23 06:52:59 +00007840 PyObject *newtable = NULL;
7841 Py_ssize_t i = 0;
7842 PyObject *key, *value, *result;
7843
7844 if (!PyDict_Check(table)) {
7845 PyErr_SetString(PyExc_TypeError, "translate argument must be a dict");
7846 return NULL;
7847 }
7848 /* fixup the table -- allow size-1 string keys instead of only int keys */
7849 newtable = PyDict_Copy(table);
7850 if (!newtable) return NULL;
7851 while (PyDict_Next(table, &i, &key, &value)) {
7852 if (PyUnicode_Check(key)) {
7853 /* convert string keys to integer keys */
7854 PyObject *newkey;
7855 int res;
7856 if (PyUnicode_GET_SIZE(key) != 1) {
7857 PyErr_SetString(PyExc_ValueError, "string items in translate "
7858 "table must be 1 element long");
7859 goto err;
7860 }
7861 newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]);
7862 if (!newkey)
7863 goto err;
7864 res = PyDict_SetItem(newtable, newkey, value);
7865 Py_DECREF(newkey);
7866 if (res < 0)
7867 goto err;
7868 } else if (PyInt_Check(key)) {
7869 /* just keep integer keys */
7870 if (PyDict_SetItem(newtable, key, value) < 0)
7871 goto err;
7872 } else {
7873 PyErr_SetString(PyExc_TypeError, "items in translate table must be "
7874 "strings or integers");
7875 goto err;
7876 }
7877 }
7878
7879 result = PyUnicode_TranslateCharmap(self->str,
7880 self->length,
7881 newtable,
7882 "ignore");
7883 Py_DECREF(newtable);
7884 return result;
7885 err:
7886 Py_DECREF(newtable);
7887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888}
7889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007890PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891"S.upper() -> unicode\n\
7892\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007893Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894
7895static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007896unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 return fixup(self, fixupper);
7899}
7900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007901PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902"S.zfill(width) -> unicode\n\
7903\n\
7904Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007905of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906
7907static PyObject *
7908unicode_zfill(PyUnicodeObject *self, PyObject *args)
7909{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007910 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 PyUnicodeObject *u;
7912
Martin v. Löwis18e16552006-02-15 17:27:45 +00007913 Py_ssize_t width;
7914 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915 return NULL;
7916
7917 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007918 if (PyUnicode_CheckExact(self)) {
7919 Py_INCREF(self);
7920 return (PyObject*) self;
7921 }
7922 else
7923 return PyUnicode_FromUnicode(
7924 PyUnicode_AS_UNICODE(self),
7925 PyUnicode_GET_SIZE(self)
7926 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 }
7928
7929 fill = width - self->length;
7930
7931 u = pad(self, fill, 0, '0');
7932
Walter Dörwald068325e2002-04-15 13:36:47 +00007933 if (u == NULL)
7934 return NULL;
7935
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 if (u->str[fill] == '+' || u->str[fill] == '-') {
7937 /* move sign to beginning of string */
7938 u->str[0] = u->str[fill];
7939 u->str[fill] = '0';
7940 }
7941
7942 return (PyObject*) u;
7943}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944
7945#if 0
7946static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007947unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 return PyInt_FromLong(unicode_freelist_size);
7950}
7951#endif
7952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007953PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007954"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007956Return True if S starts with the specified prefix, False otherwise.\n\
7957With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007958With optional end, stop comparing S at that position.\n\
7959prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960
7961static PyObject *
7962unicode_startswith(PyUnicodeObject *self,
7963 PyObject *args)
7964{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007965 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007967 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007968 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007969 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007971 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007972 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007974 if (PyTuple_Check(subobj)) {
7975 Py_ssize_t i;
7976 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7977 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7978 PyTuple_GET_ITEM(subobj, i));
7979 if (substring == NULL)
7980 return NULL;
7981 result = tailmatch(self, substring, start, end, -1);
7982 Py_DECREF(substring);
7983 if (result) {
7984 Py_RETURN_TRUE;
7985 }
7986 }
7987 /* nothing matched */
7988 Py_RETURN_FALSE;
7989 }
7990 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007992 return NULL;
7993 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007995 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996}
7997
7998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007999PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008000"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008002Return True if S ends with the specified suffix, False otherwise.\n\
8003With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008004With optional end, stop comparing S at that position.\n\
8005suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006
8007static PyObject *
8008unicode_endswith(PyUnicodeObject *self,
8009 PyObject *args)
8010{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008011 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008014 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008015 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008017 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8018 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008020 if (PyTuple_Check(subobj)) {
8021 Py_ssize_t i;
8022 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8023 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8024 PyTuple_GET_ITEM(subobj, i));
8025 if (substring == NULL)
8026 return NULL;
8027 result = tailmatch(self, substring, start, end, +1);
8028 Py_DECREF(substring);
8029 if (result) {
8030 Py_RETURN_TRUE;
8031 }
8032 }
8033 Py_RETURN_FALSE;
8034 }
8035 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008039 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008041 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042}
8043
Eric Smith8c663262007-08-25 02:26:07 +00008044#include "stringlib/string_format.h"
8045
8046PyDoc_STRVAR(format__doc__,
8047"S.format(*args, **kwargs) -> unicode\n\
8048\n\
8049");
8050
Eric Smith8c663262007-08-25 02:26:07 +00008051PyDoc_STRVAR(p_format__doc__,
8052"S.__format__(format_spec) -> unicode\n\
8053\n\
8054");
8055
8056static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008057unicode_getnewargs(PyUnicodeObject *v)
8058{
8059 return Py_BuildValue("(u#)", v->str, v->length);
8060}
8061
8062
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063static PyMethodDef unicode_methods[] = {
8064
8065 /* Order is according to common usage: often used methods should
8066 appear first, since lookup is done sequentially. */
8067
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008068 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8069 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8070 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008071 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008072 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8073 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8074 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8075 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8076 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8077 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8078 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008079 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008080 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8081 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8082 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008083 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008084 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8085 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8086 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008087 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008088 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008089 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008090 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008091 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8092 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8093 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8094 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8095 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8096 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8097 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8098 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8099 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8100 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8101 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8102 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8103 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8104 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008105 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008106 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008107 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8108 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008109 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8110 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008111#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008112 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113#endif
8114
8115#if 0
8116 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008117 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118#endif
8119
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008120 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 {NULL, NULL}
8122};
8123
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008124static PyObject *
8125unicode_mod(PyObject *v, PyObject *w)
8126{
8127 if (!PyUnicode_Check(v)) {
8128 Py_INCREF(Py_NotImplemented);
8129 return Py_NotImplemented;
8130 }
8131 return PyUnicode_Format(v, w);
8132}
8133
8134static PyNumberMethods unicode_as_number = {
8135 0, /*nb_add*/
8136 0, /*nb_subtract*/
8137 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008138 unicode_mod, /*nb_remainder*/
8139};
8140
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008142 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008143 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008144 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8145 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008146 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 0, /* sq_ass_item */
8148 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008149 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150};
8151
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008152static PyObject*
8153unicode_subscript(PyUnicodeObject* self, PyObject* item)
8154{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008155 if (PyIndex_Check(item)) {
8156 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008157 if (i == -1 && PyErr_Occurred())
8158 return NULL;
8159 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008160 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008161 return unicode_getitem(self, i);
8162 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008163 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008164 Py_UNICODE* source_buf;
8165 Py_UNICODE* result_buf;
8166 PyObject* result;
8167
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008168 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008169 &start, &stop, &step, &slicelength) < 0) {
8170 return NULL;
8171 }
8172
8173 if (slicelength <= 0) {
8174 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008175 } else if (start == 0 && step == 1 && slicelength == self->length &&
8176 PyUnicode_CheckExact(self)) {
8177 Py_INCREF(self);
8178 return (PyObject *)self;
8179 } else if (step == 1) {
8180 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008181 } else {
8182 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008183 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8184 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008185
8186 if (result_buf == NULL)
8187 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008188
8189 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8190 result_buf[i] = source_buf[cur];
8191 }
Tim Petersced69f82003-09-16 20:30:58 +00008192
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008193 result = PyUnicode_FromUnicode(result_buf, slicelength);
8194 PyMem_FREE(result_buf);
8195 return result;
8196 }
8197 } else {
8198 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8199 return NULL;
8200 }
8201}
8202
8203static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008204 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008205 (binaryfunc)unicode_subscript, /* mp_subscript */
8206 (objobjargproc)0, /* mp_ass_subscript */
8207};
8208
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210/* Helpers for PyUnicode_Format() */
8211
8212static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008213getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008215 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 if (argidx < arglen) {
8217 (*p_argidx)++;
8218 if (arglen < 0)
8219 return args;
8220 else
8221 return PyTuple_GetItem(args, argidx);
8222 }
8223 PyErr_SetString(PyExc_TypeError,
8224 "not enough arguments for format string");
8225 return NULL;
8226}
8227
8228#define F_LJUST (1<<0)
8229#define F_SIGN (1<<1)
8230#define F_BLANK (1<<2)
8231#define F_ALT (1<<3)
8232#define F_ZERO (1<<4)
8233
Martin v. Löwis18e16552006-02-15 17:27:45 +00008234static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008235strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008237 register Py_ssize_t i;
8238 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 for (i = len - 1; i >= 0; i--)
8240 buffer[i] = (Py_UNICODE) charbuffer[i];
8241
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 return len;
8243}
8244
Neal Norwitzfc76d632006-01-10 06:03:13 +00008245static int
8246doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8247{
Tim Peters15231542006-02-16 01:08:01 +00008248 Py_ssize_t result;
8249
Neal Norwitzfc76d632006-01-10 06:03:13 +00008250 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008251 result = strtounicode(buffer, (char *)buffer);
8252 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008253}
8254
8255static int
8256longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8257{
Tim Peters15231542006-02-16 01:08:01 +00008258 Py_ssize_t result;
8259
Neal Norwitzfc76d632006-01-10 06:03:13 +00008260 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008261 result = strtounicode(buffer, (char *)buffer);
8262 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008263}
8264
Guido van Rossum078151d2002-08-11 04:24:12 +00008265/* XXX To save some code duplication, formatfloat/long/int could have been
8266 shared with stringobject.c, converting from 8-bit to Unicode after the
8267 formatting is done. */
8268
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269static int
8270formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008271 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 int flags,
8273 int prec,
8274 int type,
8275 PyObject *v)
8276{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008277 /* fmt = '%#.' + `prec` + `type`
8278 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 char fmt[20];
8280 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008281
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 x = PyFloat_AsDouble(v);
8283 if (x == -1.0 && PyErr_Occurred())
8284 return -1;
8285 if (prec < 0)
8286 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8288 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008289 /* Worst case length calc to ensure no buffer overrun:
8290
8291 'g' formats:
8292 fmt = %#.<prec>g
8293 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8294 for any double rep.)
8295 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8296
8297 'f' formats:
8298 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8299 len = 1 + 50 + 1 + prec = 52 + prec
8300
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008301 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008302 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008303
8304 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008305 if (((type == 'g' || type == 'G') &&
8306 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008307 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008308 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008309 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008310 return -1;
8311 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008312 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8313 (flags&F_ALT) ? "#" : "",
8314 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008315 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316}
8317
Tim Peters38fd5b62000-09-21 05:43:11 +00008318static PyObject*
8319formatlong(PyObject *val, int flags, int prec, int type)
8320{
8321 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008322 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008323 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008324 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008325
8326 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8327 if (!str)
8328 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008329 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008330 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008331 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008332}
8333
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334static int
8335formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008336 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 int flags,
8338 int prec,
8339 int type,
8340 PyObject *v)
8341{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008342 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008343 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8344 * + 1 + 1
8345 * = 24
8346 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008347 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008348 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 long x;
8350
8351 x = PyInt_AsLong(v);
8352 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008353 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008354 if (x < 0 && type == 'u') {
8355 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008356 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008357 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8358 sign = "-";
8359 else
8360 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008362 prec = 1;
8363
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008364 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8365 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008366 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008367 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008368 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008369 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008370 return -1;
8371 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008372
8373 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008374 (type == 'x' || type == 'X' || type == 'o')) {
8375 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008376 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008377 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008378 * - when 0 is being converted, the C standard leaves off
8379 * the '0x' or '0X', which is inconsistent with other
8380 * %#x/%#X conversions and inconsistent with Python's
8381 * hex() function
8382 * - there are platforms that violate the standard and
8383 * convert 0 with the '0x' or '0X'
8384 * (Metrowerks, Compaq Tru64)
8385 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008386 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008387 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008388 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008389 * We can achieve the desired consistency by inserting our
8390 * own '0x' or '0X' prefix, and substituting %x/%X in place
8391 * of %#x/%#X.
8392 *
8393 * Note that this is the same approach as used in
8394 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008395 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008396 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8397 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008398 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008399 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008400 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8401 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008402 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008403 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008404 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008405 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008406 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008407 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408}
8409
8410static int
8411formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008412 size_t buflen,
8413 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008415 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008416 if (PyUnicode_Check(v)) {
8417 if (PyUnicode_GET_SIZE(v) != 1)
8418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008422 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008423 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008424 goto onError;
8425 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427
8428 else {
8429 /* Integer input truncated to a character */
8430 long x;
8431 x = PyInt_AsLong(v);
8432 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008433 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008434#ifdef Py_UNICODE_WIDE
8435 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008436 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008437 "%c arg not in range(0x110000) "
8438 "(wide Python build)");
8439 return -1;
8440 }
8441#else
8442 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008443 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008444 "%c arg not in range(0x10000) "
8445 "(narrow Python build)");
8446 return -1;
8447 }
8448#endif
8449 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 }
8451 buf[1] = '\0';
8452 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008453
8454 onError:
8455 PyErr_SetString(PyExc_TypeError,
8456 "%c requires int or char");
8457 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458}
8459
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in elements) of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesised so the cast cannot bind to
   neighbouring tokens at the point of use.
*/
#define FORMATBUFLEN ((size_t)120)
8469
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470PyObject *PyUnicode_Format(PyObject *format,
8471 PyObject *args)
8472{
8473 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008474 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475 int args_owned = 0;
8476 PyUnicodeObject *result = NULL;
8477 PyObject *dict = NULL;
8478 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008479
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480 if (format == NULL || args == NULL) {
8481 PyErr_BadInternalCall();
8482 return NULL;
8483 }
8484 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008485 if (uformat == NULL)
8486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 fmt = PyUnicode_AS_UNICODE(uformat);
8488 fmtcnt = PyUnicode_GET_SIZE(uformat);
8489
8490 reslen = rescnt = fmtcnt + 100;
8491 result = _PyUnicode_New(reslen);
8492 if (result == NULL)
8493 goto onError;
8494 res = PyUnicode_AS_UNICODE(result);
8495
8496 if (PyTuple_Check(args)) {
8497 arglen = PyTuple_Size(args);
8498 argidx = 0;
8499 }
8500 else {
8501 arglen = -1;
8502 argidx = -2;
8503 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008504 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008505 !PyString_Check(args) && !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 dict = args;
8507
8508 while (--fmtcnt >= 0) {
8509 if (*fmt != '%') {
8510 if (--rescnt < 0) {
8511 rescnt = fmtcnt + 100;
8512 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008513 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008514 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8516 --rescnt;
8517 }
8518 *res++ = *fmt++;
8519 }
8520 else {
8521 /* Got a format specifier */
8522 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008523 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525 Py_UNICODE c = '\0';
8526 Py_UNICODE fill;
8527 PyObject *v = NULL;
8528 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008529 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008531 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008532 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533
8534 fmt++;
8535 if (*fmt == '(') {
8536 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008537 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 PyObject *key;
8539 int pcount = 1;
8540
8541 if (dict == NULL) {
8542 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008543 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 goto onError;
8545 }
8546 ++fmt;
8547 --fmtcnt;
8548 keystart = fmt;
8549 /* Skip over balanced parentheses */
8550 while (pcount > 0 && --fmtcnt >= 0) {
8551 if (*fmt == ')')
8552 --pcount;
8553 else if (*fmt == '(')
8554 ++pcount;
8555 fmt++;
8556 }
8557 keylen = fmt - keystart - 1;
8558 if (fmtcnt < 0 || pcount > 0) {
8559 PyErr_SetString(PyExc_ValueError,
8560 "incomplete format key");
8561 goto onError;
8562 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008563#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008564 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 then looked up since Python uses strings to hold
8566 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008567 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 key = PyUnicode_EncodeUTF8(keystart,
8569 keylen,
8570 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008571#else
8572 key = PyUnicode_FromUnicode(keystart, keylen);
8573#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 if (key == NULL)
8575 goto onError;
8576 if (args_owned) {
8577 Py_DECREF(args);
8578 args_owned = 0;
8579 }
8580 args = PyObject_GetItem(dict, key);
8581 Py_DECREF(key);
8582 if (args == NULL) {
8583 goto onError;
8584 }
8585 args_owned = 1;
8586 arglen = -1;
8587 argidx = -2;
8588 }
8589 while (--fmtcnt >= 0) {
8590 switch (c = *fmt++) {
8591 case '-': flags |= F_LJUST; continue;
8592 case '+': flags |= F_SIGN; continue;
8593 case ' ': flags |= F_BLANK; continue;
8594 case '#': flags |= F_ALT; continue;
8595 case '0': flags |= F_ZERO; continue;
8596 }
8597 break;
8598 }
8599 if (c == '*') {
8600 v = getnextarg(args, arglen, &argidx);
8601 if (v == NULL)
8602 goto onError;
8603 if (!PyInt_Check(v)) {
8604 PyErr_SetString(PyExc_TypeError,
8605 "* wants int");
8606 goto onError;
8607 }
8608 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008609 if (width == -1 && PyErr_Occurred())
8610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 if (width < 0) {
8612 flags |= F_LJUST;
8613 width = -width;
8614 }
8615 if (--fmtcnt >= 0)
8616 c = *fmt++;
8617 }
8618 else if (c >= '0' && c <= '9') {
8619 width = c - '0';
8620 while (--fmtcnt >= 0) {
8621 c = *fmt++;
8622 if (c < '0' || c > '9')
8623 break;
8624 if ((width*10) / 10 != width) {
8625 PyErr_SetString(PyExc_ValueError,
8626 "width too big");
8627 goto onError;
8628 }
8629 width = width*10 + (c - '0');
8630 }
8631 }
8632 if (c == '.') {
8633 prec = 0;
8634 if (--fmtcnt >= 0)
8635 c = *fmt++;
8636 if (c == '*') {
8637 v = getnextarg(args, arglen, &argidx);
8638 if (v == NULL)
8639 goto onError;
8640 if (!PyInt_Check(v)) {
8641 PyErr_SetString(PyExc_TypeError,
8642 "* wants int");
8643 goto onError;
8644 }
8645 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008646 if (prec == -1 && PyErr_Occurred())
8647 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 if (prec < 0)
8649 prec = 0;
8650 if (--fmtcnt >= 0)
8651 c = *fmt++;
8652 }
8653 else if (c >= '0' && c <= '9') {
8654 prec = c - '0';
8655 while (--fmtcnt >= 0) {
8656 c = Py_CHARMASK(*fmt++);
8657 if (c < '0' || c > '9')
8658 break;
8659 if ((prec*10) / 10 != prec) {
8660 PyErr_SetString(PyExc_ValueError,
8661 "prec too big");
8662 goto onError;
8663 }
8664 prec = prec*10 + (c - '0');
8665 }
8666 }
8667 } /* prec */
8668 if (fmtcnt >= 0) {
8669 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 if (--fmtcnt >= 0)
8671 c = *fmt++;
8672 }
8673 }
8674 if (fmtcnt < 0) {
8675 PyErr_SetString(PyExc_ValueError,
8676 "incomplete format");
8677 goto onError;
8678 }
8679 if (c != '%') {
8680 v = getnextarg(args, arglen, &argidx);
8681 if (v == NULL)
8682 goto onError;
8683 }
8684 sign = 0;
8685 fill = ' ';
8686 switch (c) {
8687
8688 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008689 pbuf = formatbuf;
8690 /* presume that buffer length is at least 1 */
8691 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 len = 1;
8693 break;
8694
8695 case 's':
8696 case 'r':
8697 if (PyUnicode_Check(v) && c == 's') {
8698 temp = v;
8699 Py_INCREF(temp);
8700 }
8701 else {
8702 PyObject *unicode;
8703 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008704 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 else
8706 temp = PyObject_Repr(v);
8707 if (temp == NULL)
8708 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008709 if (PyUnicode_Check(temp))
8710 /* nothing to do */;
8711 else if (PyString_Check(temp)) {
8712 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008713 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008715 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008717 Py_DECREF(temp);
8718 temp = unicode;
8719 if (temp == NULL)
8720 goto onError;
8721 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008722 else {
8723 Py_DECREF(temp);
8724 PyErr_SetString(PyExc_TypeError,
8725 "%s argument has non-string str()");
8726 goto onError;
8727 }
8728 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008729 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 len = PyUnicode_GET_SIZE(temp);
8731 if (prec >= 0 && len > prec)
8732 len = prec;
8733 break;
8734
8735 case 'i':
8736 case 'd':
8737 case 'u':
8738 case 'o':
8739 case 'x':
8740 case 'X':
8741 if (c == 'i')
8742 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008743 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008744 temp = formatlong(v, flags, prec, c);
8745 if (!temp)
8746 goto onError;
8747 pbuf = PyUnicode_AS_UNICODE(temp);
8748 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008749 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008751 else {
8752 pbuf = formatbuf;
8753 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8754 flags, prec, c, v);
8755 if (len < 0)
8756 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008757 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008758 }
8759 if (flags & F_ZERO)
8760 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761 break;
8762
8763 case 'e':
8764 case 'E':
8765 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008766 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 case 'g':
8768 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008769 if (c == 'F')
8770 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008771 pbuf = formatbuf;
8772 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8773 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 if (len < 0)
8775 goto onError;
8776 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008777 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 fill = '0';
8779 break;
8780
8781 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008782 pbuf = formatbuf;
8783 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784 if (len < 0)
8785 goto onError;
8786 break;
8787
8788 default:
8789 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008790 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008791 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008792 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008793 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008794 (Py_ssize_t)(fmt - 1 -
8795 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 goto onError;
8797 }
8798 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008799 if (*pbuf == '-' || *pbuf == '+') {
8800 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 len--;
8802 }
8803 else if (flags & F_SIGN)
8804 sign = '+';
8805 else if (flags & F_BLANK)
8806 sign = ' ';
8807 else
8808 sign = 0;
8809 }
8810 if (width < len)
8811 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008812 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 reslen -= rescnt;
8814 rescnt = width + fmtcnt + 100;
8815 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008816 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008817 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008818 PyErr_NoMemory();
8819 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008820 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008821 if (_PyUnicode_Resize(&result, reslen) < 0) {
8822 Py_XDECREF(temp);
8823 goto onError;
8824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825 res = PyUnicode_AS_UNICODE(result)
8826 + reslen - rescnt;
8827 }
8828 if (sign) {
8829 if (fill != ' ')
8830 *res++ = sign;
8831 rescnt--;
8832 if (width > len)
8833 width--;
8834 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008835 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008836 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008837 assert(pbuf[1] == c);
8838 if (fill != ' ') {
8839 *res++ = *pbuf++;
8840 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008841 }
Tim Petersfff53252001-04-12 18:38:48 +00008842 rescnt -= 2;
8843 width -= 2;
8844 if (width < 0)
8845 width = 0;
8846 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 if (width > len && !(flags & F_LJUST)) {
8849 do {
8850 --rescnt;
8851 *res++ = fill;
8852 } while (--width > len);
8853 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008854 if (fill == ' ') {
8855 if (sign)
8856 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008857 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008858 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008859 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008860 *res++ = *pbuf++;
8861 *res++ = *pbuf++;
8862 }
8863 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008864 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 res += len;
8866 rescnt -= len;
8867 while (--width >= len) {
8868 --rescnt;
8869 *res++ = ' ';
8870 }
8871 if (dict && (argidx < arglen) && c != '%') {
8872 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008873 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008874 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 goto onError;
8876 }
8877 Py_XDECREF(temp);
8878 } /* '%' */
8879 } /* until end */
8880 if (argidx < arglen && !dict) {
8881 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008882 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 goto onError;
8884 }
8885
Thomas Woutersa96affe2006-03-12 00:29:36 +00008886 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 if (args_owned) {
8889 Py_DECREF(args);
8890 }
8891 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 return (PyObject *)result;
8893
8894 onError:
8895 Py_XDECREF(result);
8896 Py_DECREF(uformat);
8897 if (args_owned) {
8898 Py_DECREF(args);
8899 }
8900 return NULL;
8901}
8902
Jeremy Hylton938ace62002-07-17 16:30:39 +00008903static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008904unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8905
Tim Peters6d6c1a32001-08-02 04:15:00 +00008906static PyObject *
8907unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8908{
8909 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008910 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008911 char *encoding = NULL;
8912 char *errors = NULL;
8913
Guido van Rossume023fe02001-08-30 03:12:59 +00008914 if (type != &PyUnicode_Type)
8915 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008916 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8917 kwlist, &x, &encoding, &errors))
8918 return NULL;
8919 if (x == NULL)
8920 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008921 if (encoding == NULL && errors == NULL)
8922 return PyObject_Unicode(x);
8923 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008924 return PyUnicode_FromEncodedObject(x, encoding, errors);
8925}
8926
Guido van Rossume023fe02001-08-30 03:12:59 +00008927static PyObject *
8928unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8929{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008930 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008931 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008932
8933 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8934 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8935 if (tmp == NULL)
8936 return NULL;
8937 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008938 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008939 if (pnew == NULL) {
8940 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008941 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008942 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008943 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8944 if (pnew->str == NULL) {
8945 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008946 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008947 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008948 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008949 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008950 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8951 pnew->length = n;
8952 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008953 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008954 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008955}
8956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008957PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008958"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008959\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008960Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008961encoding defaults to the current default string encoding.\n\
8962errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008963
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008964static PyObject *unicode_iter(PyObject *seq);
8965
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008967 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008968 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 sizeof(PyUnicodeObject), /* tp_size */
8970 0, /* tp_itemsize */
8971 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008972 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008974 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008976 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008977 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008978 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008980 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 (hashfunc) unicode_hash, /* tp_hash*/
8982 0, /* tp_call*/
8983 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008984 PyObject_GenericGetAttr, /* tp_getattro */
8985 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008986 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008987 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8988 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008989 unicode_doc, /* tp_doc */
8990 0, /* tp_traverse */
8991 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008992 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008993 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008994 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008995 0, /* tp_iternext */
8996 unicode_methods, /* tp_methods */
8997 0, /* tp_members */
8998 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008999 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00009000 0, /* tp_dict */
9001 0, /* tp_descr_get */
9002 0, /* tp_descr_set */
9003 0, /* tp_dictoffset */
9004 0, /* tp_init */
9005 0, /* tp_alloc */
9006 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009007 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008};
9009
9010/* Initialize the Unicode implementation */
9011
Thomas Wouters78890102000-07-22 19:25:51 +00009012void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009014 int i;
9015
Thomas Wouters477c8d52006-05-27 19:21:47 +00009016 /* XXX - move this array to unicodectype.c ? */
9017 Py_UNICODE linebreak[] = {
9018 0x000A, /* LINE FEED */
9019 0x000D, /* CARRIAGE RETURN */
9020 0x001C, /* FILE SEPARATOR */
9021 0x001D, /* GROUP SEPARATOR */
9022 0x001E, /* RECORD SEPARATOR */
9023 0x0085, /* NEXT LINE */
9024 0x2028, /* LINE SEPARATOR */
9025 0x2029, /* PARAGRAPH SEPARATOR */
9026 };
9027
Fred Drakee4315f52000-05-09 19:53:39 +00009028 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009029 unicode_freelist = NULL;
9030 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009032 if (!unicode_empty)
9033 return;
9034
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009035 for (i = 0; i < 256; i++)
9036 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009037 if (PyType_Ready(&PyUnicode_Type) < 0)
9038 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009039
9040 /* initialize the linebreak bloom filter */
9041 bloom_linebreak = make_bloom_mask(
9042 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9043 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009044
9045 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046}
9047
9048/* Finalize the Unicode implementation */
9049
9050void
Thomas Wouters78890102000-07-22 19:25:51 +00009051_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009053 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009054 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009056 Py_XDECREF(unicode_empty);
9057 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009059 for (i = 0; i < 256; i++) {
9060 if (unicode_latin1[i]) {
9061 Py_DECREF(unicode_latin1[i]);
9062 unicode_latin1[i] = NULL;
9063 }
9064 }
9065
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009066 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067 PyUnicodeObject *v = u;
9068 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009069 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009070 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009071 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009072 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009074 unicode_freelist = NULL;
9075 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009077
Walter Dörwald16807132007-05-25 13:52:07 +00009078void
9079PyUnicode_InternInPlace(PyObject **p)
9080{
9081 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9082 PyObject *t;
9083 if (s == NULL || !PyUnicode_Check(s))
9084 Py_FatalError(
9085 "PyUnicode_InternInPlace: unicode strings only please!");
9086 /* If it's a subclass, we don't really know what putting
9087 it in the interned dict might do. */
9088 if (!PyUnicode_CheckExact(s))
9089 return;
9090 if (PyUnicode_CHECK_INTERNED(s))
9091 return;
9092 if (interned == NULL) {
9093 interned = PyDict_New();
9094 if (interned == NULL) {
9095 PyErr_Clear(); /* Don't leave an exception */
9096 return;
9097 }
9098 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009099 /* It might be that the GetItem call fails even
9100 though the key is present in the dictionary,
9101 namely when this happens during a stack overflow. */
9102 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009103 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009104 Py_END_ALLOW_RECURSION
9105
Walter Dörwald16807132007-05-25 13:52:07 +00009106 if (t) {
9107 Py_INCREF(t);
9108 Py_DECREF(*p);
9109 *p = t;
9110 return;
9111 }
9112
Martin v. Löwis5b222132007-06-10 09:51:05 +00009113 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009114 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9115 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009116 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009117 return;
9118 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009119 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009120 /* The two references in interned are not counted by refcnt.
9121 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009122 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009123 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9124}
9125
9126void
9127PyUnicode_InternImmortal(PyObject **p)
9128{
9129 PyUnicode_InternInPlace(p);
9130 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9131 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9132 Py_INCREF(*p);
9133 }
9134}
9135
9136PyObject *
9137PyUnicode_InternFromString(const char *cp)
9138{
9139 PyObject *s = PyUnicode_FromString(cp);
9140 if (s == NULL)
9141 return NULL;
9142 PyUnicode_InternInPlace(&s);
9143 return s;
9144}
9145
9146void _Py_ReleaseInternedUnicodeStrings(void)
9147{
9148 PyObject *keys;
9149 PyUnicodeObject *s;
9150 Py_ssize_t i, n;
9151 Py_ssize_t immortal_size = 0, mortal_size = 0;
9152
9153 if (interned == NULL || !PyDict_Check(interned))
9154 return;
9155 keys = PyDict_Keys(interned);
9156 if (keys == NULL || !PyList_Check(keys)) {
9157 PyErr_Clear();
9158 return;
9159 }
9160
9161 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9162 detector, interned unicode strings are not forcibly deallocated;
9163 rather, we give them their stolen references back, and then clear
9164 and DECREF the interned dict. */
9165
9166 n = PyList_GET_SIZE(keys);
9167 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9168 n);
9169 for (i = 0; i < n; i++) {
9170 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9171 switch (s->state) {
9172 case SSTATE_NOT_INTERNED:
9173 /* XXX Shouldn't happen */
9174 break;
9175 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009176 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009177 immortal_size += s->length;
9178 break;
9179 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009180 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009181 mortal_size += s->length;
9182 break;
9183 default:
9184 Py_FatalError("Inconsistent interned string state.");
9185 }
9186 s->state = SSTATE_NOT_INTERNED;
9187 }
9188 fprintf(stderr, "total size of all interned strings: "
9189 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9190 "mortal/immortal\n", mortal_size, immortal_size);
9191 Py_DECREF(keys);
9192 PyDict_Clear(interned);
9193 Py_DECREF(interned);
9194 interned = NULL;
9195}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009196
9197
9198/********************* Unicode Iterator **************************/
9199
9200typedef struct {
9201 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009202 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009203 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9204} unicodeiterobject;
9205
9206static void
9207unicodeiter_dealloc(unicodeiterobject *it)
9208{
9209 _PyObject_GC_UNTRACK(it);
9210 Py_XDECREF(it->it_seq);
9211 PyObject_GC_Del(it);
9212}
9213
9214static int
9215unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9216{
9217 Py_VISIT(it->it_seq);
9218 return 0;
9219}
9220
9221static PyObject *
9222unicodeiter_next(unicodeiterobject *it)
9223{
9224 PyUnicodeObject *seq;
9225 PyObject *item;
9226
9227 assert(it != NULL);
9228 seq = it->it_seq;
9229 if (seq == NULL)
9230 return NULL;
9231 assert(PyUnicode_Check(seq));
9232
9233 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009234 item = PyUnicode_FromUnicode(
9235 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009236 if (item != NULL)
9237 ++it->it_index;
9238 return item;
9239 }
9240
9241 Py_DECREF(seq);
9242 it->it_seq = NULL;
9243 return NULL;
9244}
9245
9246static PyObject *
9247unicodeiter_len(unicodeiterobject *it)
9248{
9249 Py_ssize_t len = 0;
9250 if (it->it_seq)
9251 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9252 return PyInt_FromSsize_t(len);
9253}
9254
9255PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9256
9257static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009258 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9259 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009260 {NULL, NULL} /* sentinel */
9261};
9262
9263PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009264 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009265 "unicodeiterator", /* tp_name */
9266 sizeof(unicodeiterobject), /* tp_basicsize */
9267 0, /* tp_itemsize */
9268 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009269 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009270 0, /* tp_print */
9271 0, /* tp_getattr */
9272 0, /* tp_setattr */
9273 0, /* tp_compare */
9274 0, /* tp_repr */
9275 0, /* tp_as_number */
9276 0, /* tp_as_sequence */
9277 0, /* tp_as_mapping */
9278 0, /* tp_hash */
9279 0, /* tp_call */
9280 0, /* tp_str */
9281 PyObject_GenericGetAttr, /* tp_getattro */
9282 0, /* tp_setattro */
9283 0, /* tp_as_buffer */
9284 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9285 0, /* tp_doc */
9286 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9287 0, /* tp_clear */
9288 0, /* tp_richcompare */
9289 0, /* tp_weaklistoffset */
9290 PyObject_SelfIter, /* tp_iter */
9291 (iternextfunc)unicodeiter_next, /* tp_iternext */
9292 unicodeiter_methods, /* tp_methods */
9293 0,
9294};
9295
9296static PyObject *
9297unicode_iter(PyObject *seq)
9298{
9299 unicodeiterobject *it;
9300
9301 if (!PyUnicode_Check(seq)) {
9302 PyErr_BadInternalCall();
9303 return NULL;
9304 }
9305 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9306 if (it == NULL)
9307 return NULL;
9308 it->it_index = 0;
9309 Py_INCREF(seq);
9310 it->it_seq = (PyUnicodeObject *)seq;
9311 _PyObject_GC_TRACK(it);
9312 return (PyObject *)it;
9313}
9314
Martin v. Löwis5b222132007-06-10 09:51:05 +00009315size_t
9316Py_UNICODE_strlen(const Py_UNICODE *u)
9317{
9318 int res = 0;
9319 while(*u++)
9320 res++;
9321 return res;
9322}
9323
9324Py_UNICODE*
9325Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9326{
9327 Py_UNICODE *u = s1;
9328 while ((*u++ = *s2++));
9329 return s1;
9330}
9331
9332Py_UNICODE*
9333Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9334{
9335 Py_UNICODE *u = s1;
9336 while ((*u++ = *s2++))
9337 if (n-- == 0)
9338 break;
9339 return s1;
9340}
9341
9342int
9343Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9344{
9345 while (*s1 && *s2 && *s1 == *s2)
9346 s1++, s2++;
9347 if (*s1 && *s2)
9348 return (*s1 < *s2) ? -1 : +1;
9349 if (*s1)
9350 return 1;
9351 if (*s2)
9352 return -1;
9353 return 0;
9354}
9355
9356Py_UNICODE*
9357Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9358{
9359 const Py_UNICODE *p;
9360 for (p = s; *p; p++)
9361 if (*p == c)
9362 return (Py_UNICODE*)p;
9363 return NULL;
9364}
9365
9366
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009367#ifdef __cplusplus
9368}
9369#endif
9370
9371
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009372/*
9373Local variables:
9374c-basic-offset: 4
9375indent-tabs-mode: nil
9376End:
9377*/