blob: 02b0c7ac0cfaf1b3c4680cd1888eb69f43c23a5d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
57#define MAX_UNICODE_FREELIST_SIZE 1024
58
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Barry Warsaw51ac5802000-03-20 16:36:48 +000065 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000109static PyUnicodeObject *unicode_freelist;
110static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000129PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000130{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000131#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000132 return 0x10FFFF;
133#else
134 /* This is actually an illegal character, so it should
135 not be passed to unichr. */
136 return 0xFFFF;
137#endif
138}
139
Thomas Wouters477c8d52006-05-27 19:21:47 +0000140/* --- Bloom Filters ----------------------------------------------------- */
141
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest 5
   bits of each Unicode character as the bit index. */
145
/* the linebreak mask is set up by _PyUnicode_Init() below */
147
148#define BLOOM_MASK unsigned long
149
150static BLOOM_MASK bloom_linebreak;
151
/* Test whether the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long so that bit 31 can be addressed
   without shifting into the sign bit of an int, and `mask` is parenthesised
   so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
153
154#define BLOOM_LINEBREAK(ch)\
155 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
156
157Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
158{
159 /* calculate simple bloom-style bitmask for a given unicode string */
160
161 long mask;
162 Py_ssize_t i;
163
164 mask = 0;
165 for (i = 0; i < len; i++)
166 mask |= (1 << (ptr[i] & 0x1F));
167
168 return mask;
169}
170
171Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
172{
173 Py_ssize_t i;
174
175 for (i = 0; i < setlen; i++)
176 if (set[i] == chr)
177 return 1;
178
179 return 0;
180}
181
/* Two-stage membership test: the cheap bloom-mask check first, then the
   exact linear scan.  The whole expansion is parenthesised so the macro
   behaves as a single operand (e.g. under `!`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
184
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185/* --- Unicode Object ----------------------------------------------------- */
186
187static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000189 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190{
191 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000192
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000193 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 /* Resizing shared object (unicode_empty or single character
198 objects) in-place is not allowed. Use PyUnicode_Resize()
199 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 if (unicode == unicode_empty ||
202 (unicode->length == 1 &&
203 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 return -1;
208 }
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210 /* We allocate one more byte to make sure the string is Ux0000 terminated.
211 The overallocation is also used by fastsearch, which assumes that it's
212 safe to look at str[length] (without making any assumptions about what
213 it contains). */
214
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 oldstr = unicode->str;
216 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
217 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 PyErr_NoMemory();
220 return -1;
221 }
222 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000223 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000225 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000227 if (unicode->defenc) {
228 Py_DECREF(unicode->defenc);
229 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 }
231 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 return 0;
234}
235
/* We allocate one more Py_UNICODE element (not just one byte) to make
   sure the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239
240 XXX This allocator could further be enhanced by assuring that the
241 free list never reduces its size below 1.
242
243*/
244
245static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 register PyUnicodeObject *unicode;
249
Thomas Wouters477c8d52006-05-27 19:21:47 +0000250 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (length == 0 && unicode_empty != NULL) {
252 Py_INCREF(unicode_empty);
253 return unicode_empty;
254 }
255
256 /* Unicode freelist & memory allocation */
257 if (unicode_freelist) {
258 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000259 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Keep-Alive optimization: we only upsize the buffer,
263 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000264 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000265 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000266 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 }
269 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000270 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000272 }
273 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000276 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (unicode == NULL)
278 return NULL;
279 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
280 }
281
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000282 if (!unicode->str) {
283 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000284 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000285 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000287 * the caller fails before initializing str -- unicode_resize()
288 * reads str[0], and the Keep-Alive optimization can keep memory
289 * allocated for str alive across a call to unicode_dealloc(unicode).
290 * We don't want unicode_resize to read uninitialized memory in
291 * that case.
292 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000293 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000297 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000298 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000300
301 onError:
302 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000303 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305}
306
307static
Guido van Rossum9475a232001-10-05 20:51:39 +0000308void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
Walter Dörwald16807132007-05-25 13:52:07 +0000310 switch (PyUnicode_CHECK_INTERNED(unicode)) {
311 case SSTATE_NOT_INTERNED:
312 break;
313
314 case SSTATE_INTERNED_MORTAL:
315 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000316 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000317 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
318 Py_FatalError(
319 "deletion of interned unicode string failed");
320 break;
321
322 case SSTATE_INTERNED_IMMORTAL:
323 Py_FatalError("Immortal interned unicode string died.");
324
325 default:
326 Py_FatalError("Inconsistent interned unicode string state.");
327 }
328
Guido van Rossum604ddf82001-12-06 20:03:56 +0000329 if (PyUnicode_CheckExact(unicode) &&
330 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000331 /* Keep-Alive optimization */
332 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000333 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 unicode->str = NULL;
335 unicode->length = 0;
336 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000337 if (unicode->defenc) {
338 Py_DECREF(unicode->defenc);
339 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000340 }
341 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 *(PyUnicodeObject **)unicode = unicode_freelist;
343 unicode_freelist = unicode;
344 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000347 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000349 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 }
351}
352
Martin v. Löwis18e16552006-02-15 17:27:45 +0000353int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000354{
355 register PyUnicodeObject *v;
356
357 /* Argument checks */
358 if (unicode == NULL) {
359 PyErr_BadInternalCall();
360 return -1;
361 }
362 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000363 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 PyErr_BadInternalCall();
365 return -1;
366 }
367
368 /* Resizing unicode_empty and single character objects is not
369 possible since these are being shared. We simply return a fresh
370 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000371 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 (v == unicode_empty || v->length == 1)) {
373 PyUnicodeObject *w = _PyUnicode_New(length);
374 if (w == NULL)
375 return -1;
376 Py_UNICODE_COPY(w->str, v->str,
377 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000378 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 *unicode = (PyObject *)w;
380 return 0;
381 }
382
383 /* Note that we don't have to modify *unicode for unshared Unicode
384 objects, since we can modify them in-place. */
385 return unicode_resize(v, length);
386}
387
388/* Internal API for use in unicodeobject.c only ! */
389#define _PyUnicode_Resize(unicodevar, length) \
390 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000393 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 PyUnicodeObject *unicode;
396
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000397 /* If the Unicode data is known at construction time, we can apply
398 some optimizations which share commonly used objects. */
399 if (u != NULL) {
400
401 /* Optimization for empty strings */
402 if (size == 0 && unicode_empty != NULL) {
403 Py_INCREF(unicode_empty);
404 return (PyObject *)unicode_empty;
405 }
406
407 /* Single character Unicode objects in the Latin-1 range are
408 shared when using this constructor */
409 if (size == 1 && *u < 256) {
410 unicode = unicode_latin1[*u];
411 if (!unicode) {
412 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413 if (!unicode)
414 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000415 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 unicode_latin1[*u] = unicode;
417 }
418 Py_INCREF(unicode);
419 return (PyObject *)unicode;
420 }
421 }
Tim Petersced69f82003-09-16 20:30:58 +0000422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 unicode = _PyUnicode_New(size);
424 if (!unicode)
425 return NULL;
426
427 /* Copy the Unicode data into the new object */
428 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430
431 return (PyObject *)unicode;
432}
433
Walter Dörwaldd2034312007-05-18 16:29:38 +0000434PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000435{
436 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000437 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000438 some optimizations which share commonly used objects.
439 Also, this means the input must be UTF-8, so fall back to the
440 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000441 if (u != NULL) {
442
443 /* Optimization for empty strings */
444 if (size == 0 && unicode_empty != NULL) {
445 Py_INCREF(unicode_empty);
446 return (PyObject *)unicode_empty;
447 }
448
Martin v. Löwis9c121062007-08-05 20:26:11 +0000449 /* Single characters are shared when using this constructor.
450 Restrict to ASCII, since the input must be UTF-8. */
451 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000453 if (!unicode) {
454 unicode = _PyUnicode_New(1);
455 if (!unicode)
456 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000457 unicode->str[0] = Py_CHARMASK(*u);
458 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000459 }
460 Py_INCREF(unicode);
461 return (PyObject *)unicode;
462 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000463
464 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
Walter Dörwald55507312007-05-18 13:12:10 +0000467 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000468 if (!unicode)
469 return NULL;
470
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000471 return (PyObject *)unicode;
472}
473
Walter Dörwaldd2034312007-05-18 16:29:38 +0000474PyObject *PyUnicode_FromString(const char *u)
475{
476 size_t size = strlen(u);
477 if (size > PY_SSIZE_T_MAX) {
478 PyErr_SetString(PyExc_OverflowError, "input too long");
479 return NULL;
480 }
481
482 return PyUnicode_FromStringAndSize(u, size);
483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485#ifdef HAVE_WCHAR_H
486
487PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000488 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489{
490 PyUnicodeObject *unicode;
491
492 if (w == NULL) {
493 PyErr_BadInternalCall();
494 return NULL;
495 }
496
497 unicode = _PyUnicode_New(size);
498 if (!unicode)
499 return NULL;
500
501 /* Copy the wchar_t data into the new object */
502#ifdef HAVE_USABLE_WCHAR_T
503 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000504#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 {
506 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000509 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 *u++ = *w++;
511 }
512#endif
513
514 return (PyObject *)unicode;
515}
516
Walter Dörwald346737f2007-05-31 10:44:43 +0000517static void
518makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
519{
520 *fmt++ = '%';
521 if (width) {
522 if (zeropad)
523 *fmt++ = '0';
524 fmt += sprintf(fmt, "%d", width);
525 }
526 if (precision)
527 fmt += sprintf(fmt, ".%d", precision);
528 if (longflag)
529 *fmt++ = 'l';
530 else if (size_tflag) {
531 char *f = PY_FORMAT_SIZE_T;
532 while (*f)
533 *fmt++ = *f++;
534 }
535 *fmt++ = c;
536 *fmt = '\0';
537}
538
Walter Dörwaldd2034312007-05-18 16:29:38 +0000539#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
540
541PyObject *
542PyUnicode_FromFormatV(const char *format, va_list vargs)
543{
544 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000545 Py_ssize_t callcount = 0;
546 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000547 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 int width = 0;
550 int precision = 0;
551 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000552 const char* f;
553 Py_UNICODE *s;
554 PyObject *string;
555 /* used by sprintf */
556 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000557 /* use abuffer instead of buffer, if we need more space
558 * (which can happen if there's a format specifier with width). */
559 char *abuffer = NULL;
560 char *realbuffer;
561 Py_ssize_t abuffersize = 0;
562 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000563 const char *copy;
564
565#ifdef VA_LIST_IS_ARRAY
566 Py_MEMCPY(count, vargs, sizeof(va_list));
567#else
568#ifdef __va_copy
569 __va_copy(count, vargs);
570#else
571 count = vargs;
572#endif
573#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000575 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000577 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000578 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 ++callcount;
580 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000581 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000582 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000583 if (callcount) {
584 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
585 if (!callresults) {
586 PyErr_NoMemory();
587 return NULL;
588 }
589 callresult = callresults;
590 }
591 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000592 for (f = format; *f; f++) {
593 if (*f == '%') {
594 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000595 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000596 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000597 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000598 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000599 ;
600
601 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
602 * they don't affect the amount of space we reserve.
603 */
604 if ((*f == 'l' || *f == 'z') &&
605 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000606 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607
608 switch (*f) {
609 case 'c':
610 (void)va_arg(count, int);
611 /* fall through... */
612 case '%':
613 n++;
614 break;
615 case 'd': case 'u': case 'i': case 'x':
616 (void) va_arg(count, int);
617 /* 20 bytes is enough to hold a 64-bit
618 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000619 This isn't enough for octal.
620 If a width is specified we need more
621 (which we allocate later). */
622 if (width < 20)
623 width = 20;
624 n += width;
625 if (abuffersize < width)
626 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000627 break;
628 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000629 {
630 /* UTF-8 */
631 unsigned char*s;
632 s = va_arg(count, unsigned char*);
633 while (*s) {
634 if (*s < 128) {
635 n++; s++;
636 } else if (*s < 0xc0) {
637 /* invalid UTF-8 */
638 n++; s++;
639 } else if (*s < 0xc0) {
640 n++;
641 s++; if(!*s)break;
642 s++;
643 } else if (*s < 0xe0) {
644 n++;
645 s++; if(!*s)break;
646 s++; if(!*s)break;
647 s++;
648 } else {
649 #ifdef Py_UNICODE_WIDE
650 n++;
651 #else
652 n+=2;
653 #endif
654 s++; if(!*s)break;
655 s++; if(!*s)break;
656 s++; if(!*s)break;
657 s++;
658 }
659 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000660 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000661 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 case 'U':
663 {
664 PyObject *obj = va_arg(count, PyObject *);
665 assert(obj && PyUnicode_Check(obj));
666 n += PyUnicode_GET_SIZE(obj);
667 break;
668 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000669 case 'V':
670 {
671 PyObject *obj = va_arg(count, PyObject *);
672 const char *str = va_arg(count, const char *);
673 assert(obj || str);
674 assert(!obj || PyUnicode_Check(obj));
675 if (obj)
676 n += PyUnicode_GET_SIZE(obj);
677 else
678 n += strlen(str);
679 break;
680 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000681 case 'S':
682 {
683 PyObject *obj = va_arg(count, PyObject *);
684 PyObject *str;
685 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000686 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000687 if (!str)
688 goto fail;
689 n += PyUnicode_GET_SIZE(str);
690 /* Remember the str and switch to the next slot */
691 *callresult++ = str;
692 break;
693 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000694 case 'R':
695 {
696 PyObject *obj = va_arg(count, PyObject *);
697 PyObject *repr;
698 assert(obj);
699 repr = PyObject_Repr(obj);
700 if (!repr)
701 goto fail;
702 n += PyUnicode_GET_SIZE(repr);
703 /* Remember the repr and switch to the next slot */
704 *callresult++ = repr;
705 break;
706 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000707 case 'p':
708 (void) va_arg(count, int);
709 /* maximum 64-bit pointer representation:
710 * 0xffffffffffffffff
711 * so 19 characters is enough.
712 * XXX I count 18 -- what's the extra for?
713 */
714 n += 19;
715 break;
716 default:
717 /* if we stumble upon an unknown
718 formatting code, copy the rest of
719 the format string to the output
720 string. (we cannot just skip the
721 code, since there's no way to know
722 what's in the argument list) */
723 n += strlen(p);
724 goto expand;
725 }
726 } else
727 n++;
728 }
729 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 if (abuffersize > 20) {
731 abuffer = PyMem_Malloc(abuffersize);
732 if (!abuffer) {
733 PyErr_NoMemory();
734 goto fail;
735 }
736 realbuffer = abuffer;
737 }
738 else
739 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000740 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000741 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000742 we don't have to resize the string.
743 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 string = PyUnicode_FromUnicode(NULL, n);
745 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747
748 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000749 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750
751 for (f = format; *f; f++) {
752 if (*f == '%') {
753 const char* p = f++;
754 int longflag = 0;
755 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 zeropad = (*f == '0');
757 /* parse the width.precision part */
758 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000759 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 width = (width*10) + *f++ - '0';
761 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 if (*f == '.') {
763 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000764 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 /* handle the long flag, but only for %ld and %lu.
768 others can be added when necessary. */
769 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
770 longflag = 1;
771 ++f;
772 }
773 /* handle the size_t flag. */
774 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
775 size_tflag = 1;
776 ++f;
777 }
778
779 switch (*f) {
780 case 'c':
781 *s++ = va_arg(vargs, int);
782 break;
783 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000784 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000786 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000788 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000789 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000790 sprintf(realbuffer, fmt, va_arg(vargs, int));
791 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 break;
793 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000794 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000795 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000796 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000798 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000800 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
801 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 break;
803 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
805 sprintf(realbuffer, fmt, va_arg(vargs, int));
806 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 break;
808 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000809 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
810 sprintf(realbuffer, fmt, va_arg(vargs, int));
811 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000812 break;
813 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000814 {
815 /* Parameter must be UTF-8 encoded.
816 In case of encoding errors, use
817 the replacement character. */
818 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000820 u = PyUnicode_DecodeUTF8(p, strlen(p),
821 "replace");
822 if (!u)
823 goto fail;
824 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
825 PyUnicode_GET_SIZE(u));
826 s += PyUnicode_GET_SIZE(u);
827 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000829 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000830 case 'U':
831 {
832 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000833 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
834 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
835 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 break;
837 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000838 case 'V':
839 {
840 PyObject *obj = va_arg(vargs, PyObject *);
841 const char *str = va_arg(vargs, const char *);
842 if (obj) {
843 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
844 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
845 s += size;
846 } else {
847 appendstring(str);
848 }
849 break;
850 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000851 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 case 'R':
853 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000854 Py_UNICODE *ucopy;
855 Py_ssize_t usize;
856 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000857 /* unused, since we already have the result */
858 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000859 ucopy = PyUnicode_AS_UNICODE(*callresult);
860 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 for (upos = 0; upos<usize;)
862 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000863 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000864 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000865 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000866 ++callresult;
867 break;
868 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000869 case 'p':
870 sprintf(buffer, "%p", va_arg(vargs, void*));
871 /* %p is ill-defined: ensure leading 0x. */
872 if (buffer[1] == 'X')
873 buffer[1] = 'x';
874 else if (buffer[1] != 'x') {
875 memmove(buffer+2, buffer, strlen(buffer)+1);
876 buffer[0] = '0';
877 buffer[1] = 'x';
878 }
879 appendstring(buffer);
880 break;
881 case '%':
882 *s++ = '%';
883 break;
884 default:
885 appendstring(p);
886 goto end;
887 }
888 } else
889 *s++ = *f;
890 }
891
892 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000893 if (callresults)
894 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 if (abuffer)
896 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
898 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000899 fail:
900 if (callresults) {
901 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000902 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000903 Py_DECREF(*callresult2);
904 ++callresult2;
905 }
906 PyMem_Free(callresults);
907 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000908 if (abuffer)
909 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000910 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000911}
912
913#undef appendstring
914
915PyObject *
916PyUnicode_FromFormat(const char *format, ...)
917{
918 PyObject* ret;
919 va_list vargs;
920
921#ifdef HAVE_STDARG_PROTOTYPES
922 va_start(vargs, format);
923#else
924 va_start(vargs);
925#endif
926 ret = PyUnicode_FromFormatV(format, vargs);
927 va_end(vargs);
928 return ret;
929}
930
Martin v. Löwis18e16552006-02-15 17:27:45 +0000931Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
932 wchar_t *w,
933 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934{
935 if (unicode == NULL) {
936 PyErr_BadInternalCall();
937 return -1;
938 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000939
940 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000942 size = PyUnicode_GET_SIZE(unicode) + 1;
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944#ifdef HAVE_USABLE_WCHAR_T
945 memcpy(w, unicode->str, size * sizeof(wchar_t));
946#else
947 {
948 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000949 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000951 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 *w++ = *u++;
953 }
954#endif
955
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000956 if (size > PyUnicode_GET_SIZE(unicode))
957 return PyUnicode_GET_SIZE(unicode);
958 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000959 return size;
960}
961
962#endif
963
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000964PyObject *PyUnicode_FromOrdinal(int ordinal)
965{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000966 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000967
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000968 if (ordinal < 0 || ordinal > 0x10ffff) {
969 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000970 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000971 return NULL;
972 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000973
974#ifndef Py_UNICODE_WIDE
975 if (ordinal > 0xffff) {
976 ordinal -= 0x10000;
977 s[0] = 0xD800 | (ordinal >> 10);
978 s[1] = 0xDC00 | (ordinal & 0x3FF);
979 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000980 }
981#endif
982
Hye-Shik Chang40574832004-04-06 07:24:51 +0000983 s[0] = (Py_UNICODE)ordinal;
984 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000985}
986
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987PyObject *PyUnicode_FromObject(register PyObject *obj)
988{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +0000990 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000991 if (PyUnicode_CheckExact(obj)) {
992 Py_INCREF(obj);
993 return obj;
994 }
995 if (PyUnicode_Check(obj)) {
996 /* For a Unicode subtype that's not a Unicode object,
997 return a true Unicode object with the same data. */
998 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
999 PyUnicode_GET_SIZE(obj));
1000 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001001 PyErr_Format(PyExc_TypeError,
1002 "Can't convert '%.100s' object to str implicitly",
1003 Py_Type(obj)->tp_name);
1004 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001005}
1006
1007PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1008 const char *encoding,
1009 const char *errors)
1010{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001011 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001012 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001014
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 if (obj == NULL) {
1016 PyErr_BadInternalCall();
1017 return NULL;
1018 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001020 if (PyUnicode_Check(obj)) {
1021 PyErr_SetString(PyExc_TypeError,
1022 "decoding Unicode is not supported");
1023 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001024 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001025
1026 /* Coerce object */
1027 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001028 s = PyString_AS_STRING(obj);
1029 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001030 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001031 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1032 /* Overwrite the error message with something more useful in
1033 case of a TypeError. */
1034 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001035 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001036 "coercing to Unicode: need string or buffer, "
1037 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001038 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001039 goto onError;
1040 }
Tim Petersced69f82003-09-16 20:30:58 +00001041
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (len == 0) {
1044 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 }
Tim Petersced69f82003-09-16 20:30:58 +00001047 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001048 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001049
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001050 return v;
1051
1052 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054}
1055
1056PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001057 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 const char *encoding,
1059 const char *errors)
1060{
1061 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001062 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001063 char lower[20]; /* Enough for any encoding name we recognize */
1064 char *l;
1065 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066
1067 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001068 encoding = PyUnicode_GetDefaultEncoding();
1069
1070 /* Convert encoding to lower case and replace '_' with '-' in order to
1071 catch e.g. UTF_8 */
1072 e = encoding;
1073 l = lower;
1074 while (*e && l < &lower[(sizeof lower) - 2]) {
1075 if (ISUPPER(*e)) {
1076 *l++ = TOLOWER(*e++);
1077 }
1078 else if (*e == '_') {
1079 *l++ = '-';
1080 e++;
1081 }
1082 else {
1083 *l++ = *e++;
1084 }
1085 }
1086 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001087
1088 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001089 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001091 else if ((strcmp(lower, "latin-1") == 0) ||
1092 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001093 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001094#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001095 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001096 return PyUnicode_DecodeMBCS(s, size, errors);
1097#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001098 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001099 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001100 else if (strcmp(lower, "utf-16") == 0)
1101 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1102 else if (strcmp(lower, "utf-32") == 0)
1103 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104
1105 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001106 buffer = NULL;
1107 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1108 goto onError;
1109 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (buffer == NULL)
1111 goto onError;
1112 unicode = PyCodec_Decode(buffer, encoding, errors);
1113 if (unicode == NULL)
1114 goto onError;
1115 if (!PyUnicode_Check(unicode)) {
1116 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001117 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001118 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 Py_DECREF(unicode);
1120 goto onError;
1121 }
1122 Py_DECREF(buffer);
1123 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001124
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 onError:
1126 Py_XDECREF(buffer);
1127 return NULL;
1128}
1129
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001130PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1131 const char *encoding,
1132 const char *errors)
1133{
1134 PyObject *v;
1135
1136 if (!PyUnicode_Check(unicode)) {
1137 PyErr_BadArgument();
1138 goto onError;
1139 }
1140
1141 if (encoding == NULL)
1142 encoding = PyUnicode_GetDefaultEncoding();
1143
1144 /* Decode via the codec registry */
1145 v = PyCodec_Decode(unicode, encoding, errors);
1146 if (v == NULL)
1147 goto onError;
1148 return v;
1149
1150 onError:
1151 return NULL;
1152}
1153
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001155 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 const char *encoding,
1157 const char *errors)
1158{
1159 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 unicode = PyUnicode_FromUnicode(s, size);
1162 if (unicode == NULL)
1163 return NULL;
1164 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1165 Py_DECREF(unicode);
1166 return v;
1167}
1168
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001169PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1170 const char *encoding,
1171 const char *errors)
1172{
1173 PyObject *v;
1174
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 goto onError;
1178 }
1179
1180 if (encoding == NULL)
1181 encoding = PyUnicode_GetDefaultEncoding();
1182
1183 /* Encode via the codec registry */
1184 v = PyCodec_Encode(unicode, encoding, errors);
1185 if (v == NULL)
1186 goto onError;
1187 return v;
1188
1189 onError:
1190 return NULL;
1191}
1192
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1194 const char *encoding,
1195 const char *errors)
1196{
1197 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001198
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 if (!PyUnicode_Check(unicode)) {
1200 PyErr_BadArgument();
1201 goto onError;
1202 }
Fred Drakee4315f52000-05-09 19:53:39 +00001203
Tim Petersced69f82003-09-16 20:30:58 +00001204 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001205 encoding = PyUnicode_GetDefaultEncoding();
1206
1207 /* Shortcuts for common default encodings */
1208 if (errors == NULL) {
1209 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001210 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001211 else if (strcmp(encoding, "latin-1") == 0)
1212 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001213#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1214 else if (strcmp(encoding, "mbcs") == 0)
1215 return PyUnicode_AsMBCSString(unicode);
1216#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001217 else if (strcmp(encoding, "ascii") == 0)
1218 return PyUnicode_AsASCIIString(unicode);
1219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220
1221 /* Encode via the codec registry */
1222 v = PyCodec_Encode(unicode, encoding, errors);
1223 if (v == NULL)
1224 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001225 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001227
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 onError:
1229 return NULL;
1230}
1231
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001232PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1233 const char *errors)
1234{
1235 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001236 if (v)
1237 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001238 if (errors != NULL)
1239 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001241 PyUnicode_GET_SIZE(unicode),
1242 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001243 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001244 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001245 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001246 return v;
1247}
1248
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001249PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001250PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001251 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001252 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1253}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001254
Christian Heimes5894ba72007-11-04 11:43:14 +00001255PyObject*
1256PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1257{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001258 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1259 can be undefined. If it is case, decode using UTF-8. The following assumes
1260 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1261 bootstrapping process where the codecs aren't ready yet.
1262 */
1263 if (Py_FileSystemDefaultEncoding) {
1264#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001265 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001266 return PyUnicode_DecodeMBCS(s, size, "replace");
1267 }
1268#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001269 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001270 return PyUnicode_DecodeUTF8(s, size, "replace");
1271 }
1272#endif
1273 return PyUnicode_Decode(s, size,
1274 Py_FileSystemDefaultEncoding,
1275 "replace");
1276 }
1277 else {
1278 return PyUnicode_DecodeUTF8(s, size, "replace");
1279 }
1280}
1281
Martin v. Löwis5b222132007-06-10 09:51:05 +00001282char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001283PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001284{
Christian Heimesf3863112007-11-22 07:46:41 +00001285 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 return NULL;
1289 }
Christian Heimesf3863112007-11-22 07:46:41 +00001290 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1291 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001292 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001293 if (psize != NULL)
Christian Heimesf3863112007-11-22 07:46:41 +00001294 *psize = PyString_GET_SIZE(bytes);
1295 return PyString_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001296}
1297
1298char*
1299PyUnicode_AsString(PyObject *unicode)
1300{
1301 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001302}
1303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1305{
1306 if (!PyUnicode_Check(unicode)) {
1307 PyErr_BadArgument();
1308 goto onError;
1309 }
1310 return PyUnicode_AS_UNICODE(unicode);
1311
1312 onError:
1313 return NULL;
1314}
1315
Martin v. Löwis18e16552006-02-15 17:27:45 +00001316Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (!PyUnicode_Check(unicode)) {
1319 PyErr_BadArgument();
1320 goto onError;
1321 }
1322 return PyUnicode_GET_SIZE(unicode);
1323
1324 onError:
1325 return -1;
1326}
1327
Thomas Wouters78890102000-07-22 19:25:51 +00001328const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001329{
1330 return unicode_default_encoding;
1331}
1332
1333int PyUnicode_SetDefaultEncoding(const char *encoding)
1334{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001335 if (strcmp(encoding, unicode_default_encoding) != 0) {
1336 PyErr_Format(PyExc_ValueError,
1337 "Can only set default encoding to %s",
1338 unicode_default_encoding);
1339 return -1;
1340 }
Fred Drakee4315f52000-05-09 19:53:39 +00001341 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001342}
1343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344/* error handling callback helper:
1345 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001346 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001347 and adjust various state variables.
1348 return 0 on success, -1 on error
1349*/
1350
1351static
1352int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1353 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001354 const char **input, const char **inend, Py_ssize_t *startinpos,
1355 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001358 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359
1360 PyObject *restuple = NULL;
1361 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001362 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001363 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001364 Py_ssize_t requiredsize;
1365 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001366 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001367 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001368 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001369 int res = -1;
1370
1371 if (*errorHandler == NULL) {
1372 *errorHandler = PyCodec_LookupError(errors);
1373 if (*errorHandler == NULL)
1374 goto onError;
1375 }
1376
1377 if (*exceptionObject == NULL) {
1378 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001379 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001380 if (*exceptionObject == NULL)
1381 goto onError;
1382 }
1383 else {
1384 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1385 goto onError;
1386 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1387 goto onError;
1388 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1389 goto onError;
1390 }
1391
1392 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1393 if (restuple == NULL)
1394 goto onError;
1395 if (!PyTuple_Check(restuple)) {
1396 PyErr_Format(PyExc_TypeError, &argparse[4]);
1397 goto onError;
1398 }
1399 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1400 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001401
1402 /* Copy back the bytes variables, which might have been modified by the
1403 callback */
1404 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1405 if (!inputobj)
1406 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001407 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001408 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1409 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001410 *input = PyString_AS_STRING(inputobj);
1411 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001412 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001413 /* we can DECREF safely, as the exception has another reference,
1414 so the object won't go away. */
1415 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001417 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001418 newpos = insize+newpos;
1419 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001420 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001421 goto onError;
1422 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001423
1424 /* need more space? (at least enough for what we
1425 have+the replacement+the rest of the string (starting
1426 at the new input position), so we won't have to check space
1427 when there are no errors in the rest of the string) */
1428 repptr = PyUnicode_AS_UNICODE(repunicode);
1429 repsize = PyUnicode_GET_SIZE(repunicode);
1430 requiredsize = *outpos + repsize + insize-newpos;
1431 if (requiredsize > outsize) {
1432 if (requiredsize<2*outsize)
1433 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001434 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 goto onError;
1436 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1437 }
1438 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001439 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 Py_UNICODE_COPY(*outptr, repptr, repsize);
1441 *outptr += repsize;
1442 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001444 /* we made it! */
1445 res = 0;
1446
1447 onError:
1448 Py_XDECREF(restuple);
1449 return res;
1450}
1451
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001452/* --- UTF-7 Codec -------------------------------------------------------- */
1453
1454/* see RFC2152 for details */
1455
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character code.  The value tells whether the character is
   special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1474
/* SPECIAL(c, encodeO, encodeWS) is true when character c cannot be written
   directly: it is outside 1..127, or utf7_special marks it as special (1),
   or as whitespace (2) while encodeWS is set, or as Set O (3) while
   encodeO is set.

   Note: the range test is written as (c) <= 0 rather than (c) < 0 as a
   trick to work around gcc warnings about the comparison always being
   false; since utf7_special[0] is 1, treating 0 as special here does not
   change the result. */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 ||                           \
     utf7_special[(c)] == 1 ||                          \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))
1484
/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n)                                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"     \
     [(n) & 0x3f])

/* B64CHAR(c): true if c belongs to the base64 alphabet. */
#define B64CHAR(c)                                      \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of the base64 character c; the result is only
   meaningful for characters accepted by B64CHAR(). */
#define UB64(c)                                         \
    ((c) == '+' ? 62 :                                  \
     (c) == '/' ? 63 :                                  \
     (c) >= 'a' ? (c) - 71 :                            \
     (c) >= 'A' ? (c) - 65 :                            \
     (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492
/* ENCODE(out, ch, bits): while at least six bits are pending in ch, write
   the base64 character for the topmost six pending bits through out
   (advancing it) and lower bits by six.  Both out and bits are modified in
   place. */
#define ENCODE(out, ch, bits)                           \
    while (bits >= 6) {                                 \
        *out++ = B64(ch >> (bits-6));                   \
        bits -= 6;                                      \
    }
1498
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, take the topmost 16 pending bits as one Py_UNICODE unit and lower
   bits by 16.  Each unit is then handled as follows:

   - surrogate flag set: an error has already been generated for the
     preceding unit, so this one is dropped without further checks and the
     flag is cleared;
   - unit in 0xDC00..0xDFFF: part of a surrogate pair, which can't be
     represented in a 16-bit character; the flag is set, errmsg is assigned
     and control jumps to the utf7Error label;
   - otherwise: the unit is stored through out, which is advanced.

   The expansion uses the variable errmsg and the label utf7Error, both of
   which must exist at the place where the macro is used. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while (bits >= 16) {                                                    \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);       \
        bits -= 16;                                                         \
        if (surrogate) {                                                    \
            surrogate = 0;                                                  \
        } else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {                    \
            surrogate = 1;                                                  \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *out++ = outCh;                                                 \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001517
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001518PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001519 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001520 const char *errors)
1521{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001522 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1523}
1524
1525PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1526 Py_ssize_t size,
1527 const char *errors,
1528 Py_ssize_t *consumed)
1529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001531 Py_ssize_t startinpos;
1532 Py_ssize_t endinpos;
1533 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001534 const char *e;
1535 PyUnicodeObject *unicode;
1536 Py_UNICODE *p;
1537 const char *errmsg = "";
1538 int inShift = 0;
1539 unsigned int bitsleft = 0;
1540 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 int surrogate = 0;
1542 PyObject *errorHandler = NULL;
1543 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001544
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001548 if (size == 0) {
1549 if (consumed)
1550 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001551 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001552 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553
1554 p = unicode->str;
1555 e = s + size;
1556
1557 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001558 Py_UNICODE ch;
1559 restart:
1560 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561
1562 if (inShift) {
1563 if ((ch == '-') || !B64CHAR(ch)) {
1564 inShift = 0;
1565 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001566
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001567 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1568 if (bitsleft >= 6) {
1569 /* The shift sequence has a partial character in it. If
1570 bitsleft < 6 then we could just classify it as padding
1571 but that is not the case here */
1572
1573 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001574 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001575 }
1576 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001577 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 here so indicate the potential of a misencoded character. */
1579
1580 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1581 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1582 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001583 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001584 }
1585
1586 if (ch == '-') {
1587 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001588 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 inShift = 1;
1590 }
1591 } else if (SPECIAL(ch,0,0)) {
1592 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001593 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 } else {
1595 *p++ = ch;
1596 }
1597 } else {
1598 charsleft = (charsleft << 6) | UB64(ch);
1599 bitsleft += 6;
1600 s++;
1601 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1602 }
1603 }
1604 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001606 s++;
1607 if (s < e && *s == '-') {
1608 s++;
1609 *p++ = '+';
1610 } else
1611 {
1612 inShift = 1;
1613 bitsleft = 0;
1614 }
1615 }
1616 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001617 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001618 errmsg = "unexpected special character";
1619 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001620 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001621 }
1622 else {
1623 *p++ = ch;
1624 s++;
1625 }
1626 continue;
1627 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 outpos = p-PyUnicode_AS_UNICODE(unicode);
1629 endinpos = s-starts;
1630 if (unicode_decode_call_errorhandler(
1631 errors, &errorHandler,
1632 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001633 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 (PyObject **)&unicode, &outpos, &p))
1635 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 }
1637
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001638 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001639 outpos = p-PyUnicode_AS_UNICODE(unicode);
1640 endinpos = size;
1641 if (unicode_decode_call_errorhandler(
1642 errors, &errorHandler,
1643 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001644 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001645 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001646 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001647 if (s < e)
1648 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001650 if (consumed) {
1651 if(inShift)
1652 *consumed = startinpos;
1653 else
1654 *consumed = s-starts;
1655 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001657 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 goto onError;
1659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 Py_XDECREF(errorHandler);
1661 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 return (PyObject *)unicode;
1663
1664onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001665 Py_XDECREF(errorHandler);
1666 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001667 Py_DECREF(unicode);
1668 return NULL;
1669}
1670
1671
1672PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001673 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001674 int encodeSetO,
1675 int encodeWhiteSpace,
1676 const char *errors)
1677{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001678 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001680 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001681 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001682 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001683 unsigned int bitsleft = 0;
1684 unsigned long charsleft = 0;
1685 char * out;
1686 char * start;
1687
1688 if (size == 0)
Christian Heimesf3863112007-11-22 07:46:41 +00001689 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001690
Walter Dörwald51ab4142007-05-05 14:43:36 +00001691 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001692 if (v == NULL)
1693 return NULL;
1694
Walter Dörwald51ab4142007-05-05 14:43:36 +00001695 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001696 for (;i < size; ++i) {
1697 Py_UNICODE ch = s[i];
1698
1699 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001700 if (ch == '+') {
1701 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 *out++ = '-';
1703 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1704 charsleft = ch;
1705 bitsleft = 16;
1706 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001707 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001709 } else {
1710 *out++ = (char) ch;
1711 }
1712 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001713 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1714 *out++ = B64(charsleft << (6-bitsleft));
1715 charsleft = 0;
1716 bitsleft = 0;
1717 /* Characters not in the BASE64 set implicitly unshift the sequence
1718 so no '-' is required, except if the character is itself a '-' */
1719 if (B64CHAR(ch) || ch == '-') {
1720 *out++ = '-';
1721 }
1722 inShift = 0;
1723 *out++ = (char) ch;
1724 } else {
1725 bitsleft += 16;
1726 charsleft = (charsleft << 16) | ch;
1727 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1728
1729 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001730 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001731 or '-' then the shift sequence will be terminated implicitly and we
1732 don't have to insert a '-'. */
1733
1734 if (bitsleft == 0) {
1735 if (i + 1 < size) {
1736 Py_UNICODE ch2 = s[i+1];
1737
1738 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001739
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 } else if (B64CHAR(ch2) || ch2 == '-') {
1741 *out++ = '-';
1742 inShift = 0;
1743 } else {
1744 inShift = 0;
1745 }
1746
1747 }
1748 else {
1749 *out++ = '-';
1750 inShift = 0;
1751 }
1752 }
Tim Petersced69f82003-09-16 20:30:58 +00001753 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001754 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001755 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001756 if (bitsleft) {
1757 *out++= B64(charsleft << (6-bitsleft) );
1758 *out++ = '-';
1759 }
1760
Guido van Rossum98297ee2007-11-06 21:34:58 +00001761 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1762 Py_DECREF(v);
1763 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001764}
1765
/* The UTF-7 helper macros are not needed past this point. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1772
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773/* --- UTF-8 Codec -------------------------------------------------------- */
1774
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero marks a byte that cannot start a sequence.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: illegal as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFD: four-, five- and six-byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
1796
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001798 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799 const char *errors)
1800{
Walter Dörwald69652032004-09-07 20:24:22 +00001801 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1802}
1803
1804PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001805 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001806 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001807 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001811 Py_ssize_t startinpos;
1812 Py_ssize_t endinpos;
1813 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 const char *e;
1815 PyUnicodeObject *unicode;
1816 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001817 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 PyObject *errorHandler = NULL;
1819 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001820
1821 /* Note: size will always be longer than the resulting Unicode
1822 character count */
1823 unicode = _PyUnicode_New(size);
1824 if (!unicode)
1825 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001826 if (size == 0) {
1827 if (consumed)
1828 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831
1832 /* Unpack UTF-8 encoded data */
1833 p = unicode->str;
1834 e = s + size;
1835
1836 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001837 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838
1839 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001840 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 s++;
1842 continue;
1843 }
1844
1845 n = utf8_code_length[ch];
1846
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001847 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001848 if (consumed)
1849 break;
1850 else {
1851 errmsg = "unexpected end of data";
1852 startinpos = s-starts;
1853 endinpos = size;
1854 goto utf8Error;
1855 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857
1858 switch (n) {
1859
1860 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001861 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 startinpos = s-starts;
1863 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001864 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865
1866 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001867 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 startinpos = s-starts;
1869 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001870 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871
1872 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001873 if ((s[1] & 0xc0) != 0x80) {
1874 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001875 startinpos = s-starts;
1876 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001877 goto utf8Error;
1878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001880 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 startinpos = s-starts;
1882 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001883 errmsg = "illegal encoding";
1884 goto utf8Error;
1885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001887 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 break;
1889
1890 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001891 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001892 (s[2] & 0xc0) != 0x80) {
1893 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 startinpos = s-starts;
1895 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001896 goto utf8Error;
1897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001899 if (ch < 0x0800) {
1900 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001901 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001902
1903 XXX For wide builds (UCS-4) we should probably try
1904 to recombine the surrogates into a single code
1905 unit.
1906 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 startinpos = s-starts;
1909 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001910 goto utf8Error;
1911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001913 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001914 break;
1915
1916 case 4:
1917 if ((s[1] & 0xc0) != 0x80 ||
1918 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001919 (s[3] & 0xc0) != 0x80) {
1920 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001921 startinpos = s-starts;
1922 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001923 goto utf8Error;
1924 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001925 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1926 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1927 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001928 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001929 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001930 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001931 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001932 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 startinpos = s-starts;
1935 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 goto utf8Error;
1937 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001938#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001939 *p++ = (Py_UNICODE)ch;
1940#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001941 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001942
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001943 /* translate from 10000..10FFFF to 0..FFFF */
1944 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001945
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001946 /* high surrogate = top 10 bits added to D800 */
1947 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001948
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001949 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001950 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001951#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 break;
1953
1954 default:
1955 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001956 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001957 startinpos = s-starts;
1958 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 }
1961 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001962 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001963
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001964 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 outpos = p-PyUnicode_AS_UNICODE(unicode);
1966 if (unicode_decode_call_errorhandler(
1967 errors, &errorHandler,
1968 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001969 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001970 (PyObject **)&unicode, &outpos, &p))
1971 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 }
Walter Dörwald69652032004-09-07 20:24:22 +00001973 if (consumed)
1974 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
1976 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001977 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978 goto onError;
1979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001980 Py_XDECREF(errorHandler);
1981 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 return (PyObject *)unicode;
1983
1984onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 Py_XDECREF(errorHandler);
1986 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 Py_DECREF(unicode);
1988 return NULL;
1989}
1990
Tim Peters602f7402002-04-27 18:03:26 +00001991/* Allocation strategy: if the string is short, convert into a stack buffer
1992 and allocate exactly as much space needed at the end. Else allocate the
1993 maximum possible needed (4 result bytes per Unicode character), and return
1994 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001995*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001996PyObject *
1997PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001998 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001999 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000{
Tim Peters602f7402002-04-27 18:03:26 +00002001#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002002
Guido van Rossum98297ee2007-11-06 21:34:58 +00002003 Py_ssize_t i; /* index into s of next input byte */
2004 PyObject *result; /* result string object */
2005 char *p; /* next free byte in output buffer */
2006 Py_ssize_t nallocated; /* number of result bytes allocated */
2007 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002008 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002009
Tim Peters602f7402002-04-27 18:03:26 +00002010 assert(s != NULL);
2011 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012
Tim Peters602f7402002-04-27 18:03:26 +00002013 if (size <= MAX_SHORT_UNICHARS) {
2014 /* Write into the stack buffer; nallocated can't overflow.
2015 * At the end, we'll allocate exactly as much heap space as it
2016 * turns out we need.
2017 */
2018 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002019 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002020 p = stackbuf;
2021 }
2022 else {
2023 /* Overallocate on the heap, and give the excess back at the end. */
2024 nallocated = size * 4;
2025 if (nallocated / 4 != size) /* overflow! */
2026 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002027 result = PyString_FromStringAndSize(NULL, nallocated);
2028 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002029 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002030 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002031 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002032
Tim Peters602f7402002-04-27 18:03:26 +00002033 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002034 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002035
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002036 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002037 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002041 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002042 *p++ = (char)(0xc0 | (ch >> 6));
2043 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002044 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002045 else {
Tim Peters602f7402002-04-27 18:03:26 +00002046 /* Encode UCS2 Unicode ordinals */
2047 if (ch < 0x10000) {
2048 /* Special case: check for high surrogate */
2049 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2050 Py_UCS4 ch2 = s[i];
2051 /* Check for low surrogate and combine the two to
2052 form a UCS4 value */
2053 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002054 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002055 i++;
2056 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002057 }
Tim Peters602f7402002-04-27 18:03:26 +00002058 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002059 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002060 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002061 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2062 *p++ = (char)(0x80 | (ch & 0x3f));
2063 continue;
2064 }
2065encodeUCS4:
2066 /* Encode UCS4 Unicode ordinals */
2067 *p++ = (char)(0xf0 | (ch >> 18));
2068 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2069 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2070 *p++ = (char)(0x80 | (ch & 0x3f));
2071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002073
Guido van Rossum98297ee2007-11-06 21:34:58 +00002074 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002075 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002076 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002077 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002078 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002079 }
2080 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002081 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002082 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002083 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002084 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002085 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002086 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002087
Tim Peters602f7402002-04-27 18:03:26 +00002088#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089}
2090
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2092{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 if (!PyUnicode_Check(unicode)) {
2094 PyErr_BadArgument();
2095 return NULL;
2096 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002097 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2098 PyUnicode_GET_SIZE(unicode),
2099 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100}
2101
Walter Dörwald41980ca2007-08-16 21:55:45 +00002102/* --- UTF-32 Codec ------------------------------------------------------- */
2103
2104PyObject *
2105PyUnicode_DecodeUTF32(const char *s,
2106 Py_ssize_t size,
2107 const char *errors,
2108 int *byteorder)
2109{
2110 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2111}
2112
2113PyObject *
2114PyUnicode_DecodeUTF32Stateful(const char *s,
2115 Py_ssize_t size,
2116 const char *errors,
2117 int *byteorder,
2118 Py_ssize_t *consumed)
2119{
2120 const char *starts = s;
2121 Py_ssize_t startinpos;
2122 Py_ssize_t endinpos;
2123 Py_ssize_t outpos;
2124 PyUnicodeObject *unicode;
2125 Py_UNICODE *p;
2126#ifndef Py_UNICODE_WIDE
2127 int i, pairs;
2128#else
2129 const int pairs = 0;
2130#endif
2131 const unsigned char *q, *e;
2132 int bo = 0; /* assume native ordering by default */
2133 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002134 /* Offsets from q for retrieving bytes in the right order. */
2135#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2136 int iorder[] = {0, 1, 2, 3};
2137#else
2138 int iorder[] = {3, 2, 1, 0};
2139#endif
2140 PyObject *errorHandler = NULL;
2141 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002142 /* On narrow builds we split characters outside the BMP into two
2143 codepoints => count how much extra space we need. */
2144#ifndef Py_UNICODE_WIDE
2145 for (i = pairs = 0; i < size/4; i++)
2146 if (((Py_UCS4 *)s)[i] >= 0x10000)
2147 pairs++;
2148#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002149
2150 /* This might be one to much, because of a BOM */
2151 unicode = _PyUnicode_New((size+3)/4+pairs);
2152 if (!unicode)
2153 return NULL;
2154 if (size == 0)
2155 return (PyObject *)unicode;
2156
2157 /* Unpack UTF-32 encoded data */
2158 p = unicode->str;
2159 q = (unsigned char *)s;
2160 e = q + size;
2161
2162 if (byteorder)
2163 bo = *byteorder;
2164
2165 /* Check for BOM marks (U+FEFF) in the input and adjust current
2166 byte order setting accordingly. In native mode, the leading BOM
2167 mark is skipped, in all other modes, it is copied to the output
2168 stream as-is (giving a ZWNBSP character). */
2169 if (bo == 0) {
2170 if (size >= 4) {
2171 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2172 (q[iorder[1]] << 8) | q[iorder[0]];
2173#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2174 if (bom == 0x0000FEFF) {
2175 q += 4;
2176 bo = -1;
2177 }
2178 else if (bom == 0xFFFE0000) {
2179 q += 4;
2180 bo = 1;
2181 }
2182#else
2183 if (bom == 0x0000FEFF) {
2184 q += 4;
2185 bo = 1;
2186 }
2187 else if (bom == 0xFFFE0000) {
2188 q += 4;
2189 bo = -1;
2190 }
2191#endif
2192 }
2193 }
2194
2195 if (bo == -1) {
2196 /* force LE */
2197 iorder[0] = 0;
2198 iorder[1] = 1;
2199 iorder[2] = 2;
2200 iorder[3] = 3;
2201 }
2202 else if (bo == 1) {
2203 /* force BE */
2204 iorder[0] = 3;
2205 iorder[1] = 2;
2206 iorder[2] = 1;
2207 iorder[3] = 0;
2208 }
2209
2210 while (q < e) {
2211 Py_UCS4 ch;
2212 /* remaining bytes at the end? (size should be divisible by 4) */
2213 if (e-q<4) {
2214 if (consumed)
2215 break;
2216 errmsg = "truncated data";
2217 startinpos = ((const char *)q)-starts;
2218 endinpos = ((const char *)e)-starts;
2219 goto utf32Error;
2220 /* The remaining input chars are ignored if the callback
2221 chooses to skip the input */
2222 }
2223 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2224 (q[iorder[1]] << 8) | q[iorder[0]];
2225
2226 if (ch >= 0x110000)
2227 {
2228 errmsg = "codepoint not in range(0x110000)";
2229 startinpos = ((const char *)q)-starts;
2230 endinpos = startinpos+4;
2231 goto utf32Error;
2232 }
2233#ifndef Py_UNICODE_WIDE
2234 if (ch >= 0x10000)
2235 {
2236 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2237 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2238 }
2239 else
2240#endif
2241 *p++ = ch;
2242 q += 4;
2243 continue;
2244 utf32Error:
2245 outpos = p-PyUnicode_AS_UNICODE(unicode);
2246 if (unicode_decode_call_errorhandler(
2247 errors, &errorHandler,
2248 "utf32", errmsg,
2249 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2250 (PyObject **)&unicode, &outpos, &p))
2251 goto onError;
2252 }
2253
2254 if (byteorder)
2255 *byteorder = bo;
2256
2257 if (consumed)
2258 *consumed = (const char *)q-starts;
2259
2260 /* Adjust length */
2261 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2262 goto onError;
2263
2264 Py_XDECREF(errorHandler);
2265 Py_XDECREF(exc);
2266 return (PyObject *)unicode;
2267
2268onError:
2269 Py_DECREF(unicode);
2270 Py_XDECREF(errorHandler);
2271 Py_XDECREF(exc);
2272 return NULL;
2273}
2274
2275PyObject *
2276PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2277 Py_ssize_t size,
2278 const char *errors,
2279 int byteorder)
2280{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002281 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002282 unsigned char *p;
2283#ifndef Py_UNICODE_WIDE
2284 int i, pairs;
2285#else
2286 const int pairs = 0;
2287#endif
2288 /* Offsets from p for storing byte pairs in the right order. */
2289#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2290 int iorder[] = {0, 1, 2, 3};
2291#else
2292 int iorder[] = {3, 2, 1, 0};
2293#endif
2294
2295#define STORECHAR(CH) \
2296 do { \
2297 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2298 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2299 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2300 p[iorder[0]] = (CH) & 0xff; \
2301 p += 4; \
2302 } while(0)
2303
2304 /* In narrow builds we can output surrogate pairs as one codepoint,
2305 so we need less space. */
2306#ifndef Py_UNICODE_WIDE
2307 for (i = pairs = 0; i < size-1; i++)
2308 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2309 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2310 pairs++;
2311#endif
2312 v = PyBytes_FromStringAndSize(NULL,
2313 4 * (size - pairs + (byteorder == 0)));
2314 if (v == NULL)
2315 return NULL;
2316
2317 p = (unsigned char *)PyBytes_AS_STRING(v);
2318 if (byteorder == 0)
2319 STORECHAR(0xFEFF);
2320 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002321 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002322
2323 if (byteorder == -1) {
2324 /* force LE */
2325 iorder[0] = 0;
2326 iorder[1] = 1;
2327 iorder[2] = 2;
2328 iorder[3] = 3;
2329 }
2330 else if (byteorder == 1) {
2331 /* force BE */
2332 iorder[0] = 3;
2333 iorder[1] = 2;
2334 iorder[2] = 1;
2335 iorder[3] = 0;
2336 }
2337
2338 while (size-- > 0) {
2339 Py_UCS4 ch = *s++;
2340#ifndef Py_UNICODE_WIDE
2341 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2342 Py_UCS4 ch2 = *s;
2343 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2344 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2345 s++;
2346 size--;
2347 }
2348 }
2349#endif
2350 STORECHAR(ch);
2351 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002352
2353 done:
2354 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
2355 Py_DECREF(v);
2356 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002357#undef STORECHAR
2358}
2359
2360PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2361{
2362 if (!PyUnicode_Check(unicode)) {
2363 PyErr_BadArgument();
2364 return NULL;
2365 }
2366 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2367 PyUnicode_GET_SIZE(unicode),
2368 NULL,
2369 0);
2370}
2371
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372/* --- UTF-16 Codec ------------------------------------------------------- */
2373
Tim Peters772747b2001-08-09 22:21:55 +00002374PyObject *
2375PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002376 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002377 const char *errors,
2378 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379{
Walter Dörwald69652032004-09-07 20:24:22 +00002380 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2381}
2382
2383PyObject *
2384PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002385 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002386 const char *errors,
2387 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002388 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002389{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002390 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002391 Py_ssize_t startinpos;
2392 Py_ssize_t endinpos;
2393 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 PyUnicodeObject *unicode;
2395 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002396 const unsigned char *q, *e;
2397 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002398 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002399 /* Offsets from q for retrieving byte pairs in the right order. */
2400#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2401 int ihi = 1, ilo = 0;
2402#else
2403 int ihi = 0, ilo = 1;
2404#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002405 PyObject *errorHandler = NULL;
2406 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407
2408 /* Note: size will always be longer than the resulting Unicode
2409 character count */
2410 unicode = _PyUnicode_New(size);
2411 if (!unicode)
2412 return NULL;
2413 if (size == 0)
2414 return (PyObject *)unicode;
2415
2416 /* Unpack UTF-16 encoded data */
2417 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002418 q = (unsigned char *)s;
2419 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420
2421 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002422 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002424 /* Check for BOM marks (U+FEFF) in the input and adjust current
2425 byte order setting accordingly. In native mode, the leading BOM
2426 mark is skipped, in all other modes, it is copied to the output
2427 stream as-is (giving a ZWNBSP character). */
2428 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002429 if (size >= 2) {
2430 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002431#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002432 if (bom == 0xFEFF) {
2433 q += 2;
2434 bo = -1;
2435 }
2436 else if (bom == 0xFFFE) {
2437 q += 2;
2438 bo = 1;
2439 }
Tim Petersced69f82003-09-16 20:30:58 +00002440#else
Walter Dörwald69652032004-09-07 20:24:22 +00002441 if (bom == 0xFEFF) {
2442 q += 2;
2443 bo = 1;
2444 }
2445 else if (bom == 0xFFFE) {
2446 q += 2;
2447 bo = -1;
2448 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002449#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002450 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452
Tim Peters772747b2001-08-09 22:21:55 +00002453 if (bo == -1) {
2454 /* force LE */
2455 ihi = 1;
2456 ilo = 0;
2457 }
2458 else if (bo == 1) {
2459 /* force BE */
2460 ihi = 0;
2461 ilo = 1;
2462 }
2463
2464 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002465 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002466 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002467 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002468 if (consumed)
2469 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 errmsg = "truncated data";
2471 startinpos = ((const char *)q)-starts;
2472 endinpos = ((const char *)e)-starts;
2473 goto utf16Error;
2474 /* The remaining input chars are ignored if the callback
2475 chooses to skip the input */
2476 }
2477 ch = (q[ihi] << 8) | q[ilo];
2478
Tim Peters772747b2001-08-09 22:21:55 +00002479 q += 2;
2480
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 if (ch < 0xD800 || ch > 0xDFFF) {
2482 *p++ = ch;
2483 continue;
2484 }
2485
2486 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002487 if (q >= e) {
2488 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002489 startinpos = (((const char *)q)-2)-starts;
2490 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002491 goto utf16Error;
2492 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002493 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002494 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2495 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002496 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002497#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002498 *p++ = ch;
2499 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002500#else
2501 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002502#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002503 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002504 }
2505 else {
2506 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 startinpos = (((const char *)q)-4)-starts;
2508 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002509 goto utf16Error;
2510 }
2511
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002513 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002514 startinpos = (((const char *)q)-2)-starts;
2515 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002516 /* Fall through to report the error */
2517
2518 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002519 outpos = p-PyUnicode_AS_UNICODE(unicode);
2520 if (unicode_decode_call_errorhandler(
2521 errors, &errorHandler,
2522 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002523 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002525 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 }
2527
2528 if (byteorder)
2529 *byteorder = bo;
2530
Walter Dörwald69652032004-09-07 20:24:22 +00002531 if (consumed)
2532 *consumed = (const char *)q-starts;
2533
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002535 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 goto onError;
2537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 Py_XDECREF(errorHandler);
2539 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 return (PyObject *)unicode;
2541
2542onError:
2543 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 Py_XDECREF(errorHandler);
2545 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 return NULL;
2547}
2548
Tim Peters772747b2001-08-09 22:21:55 +00002549PyObject *
2550PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002551 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002552 const char *errors,
2553 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002555 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002556 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002557#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002558 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002559#else
2560 const int pairs = 0;
2561#endif
Tim Peters772747b2001-08-09 22:21:55 +00002562 /* Offsets from p for storing byte pairs in the right order. */
2563#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2564 int ihi = 1, ilo = 0;
2565#else
2566 int ihi = 0, ilo = 1;
2567#endif
2568
2569#define STORECHAR(CH) \
2570 do { \
2571 p[ihi] = ((CH) >> 8) & 0xff; \
2572 p[ilo] = (CH) & 0xff; \
2573 p += 2; \
2574 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002576#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002577 for (i = pairs = 0; i < size; i++)
2578 if (s[i] >= 0x10000)
2579 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002580#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002581 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002582 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 if (v == NULL)
2584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585
Walter Dörwald3cc34522007-05-04 10:48:27 +00002586 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002588 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002589 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002590 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002591
2592 if (byteorder == -1) {
2593 /* force LE */
2594 ihi = 1;
2595 ilo = 0;
2596 }
2597 else if (byteorder == 1) {
2598 /* force BE */
2599 ihi = 0;
2600 ilo = 1;
2601 }
2602
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002603 while (size-- > 0) {
2604 Py_UNICODE ch = *s++;
2605 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002606#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002607 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002608 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2609 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002611#endif
Tim Peters772747b2001-08-09 22:21:55 +00002612 STORECHAR(ch);
2613 if (ch2)
2614 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002615 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002616
2617 done:
2618 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
2619 Py_DECREF(v);
2620 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002621#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622}
2623
2624PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2625{
2626 if (!PyUnicode_Check(unicode)) {
2627 PyErr_BadArgument();
2628 return NULL;
2629 }
2630 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2631 PyUnicode_GET_SIZE(unicode),
2632 NULL,
2633 0);
2634}
2635
2636/* --- Unicode Escape Codec ----------------------------------------------- */
2637
Fredrik Lundh06d12682001-01-24 07:59:11 +00002638static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002639
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002641 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 const char *errors)
2643{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002645 Py_ssize_t startinpos;
2646 Py_ssize_t endinpos;
2647 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002648 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002650 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002652 char* message;
2653 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 PyObject *errorHandler = NULL;
2655 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002656
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 /* Escaped strings will always be longer than the resulting
2658 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002659 length after conversion to the true value.
2660 (but if the error callback returns a long replacement string
2661 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 v = _PyUnicode_New(size);
2663 if (v == NULL)
2664 goto onError;
2665 if (size == 0)
2666 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002670
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 while (s < end) {
2672 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002673 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675
2676 /* Non-escape characters are interpreted as Unicode ordinals */
2677 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002678 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 continue;
2680 }
2681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 /* \ - Escapes */
2684 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002685 c = *s++;
2686 if (s > end)
2687 c = '\0'; /* Invalid after \ */
2688 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689
2690 /* \x escapes */
2691 case '\n': break;
2692 case '\\': *p++ = '\\'; break;
2693 case '\'': *p++ = '\''; break;
2694 case '\"': *p++ = '\"'; break;
2695 case 'b': *p++ = '\b'; break;
2696 case 'f': *p++ = '\014'; break; /* FF */
2697 case 't': *p++ = '\t'; break;
2698 case 'n': *p++ = '\n'; break;
2699 case 'r': *p++ = '\r'; break;
2700 case 'v': *p++ = '\013'; break; /* VT */
2701 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2702
2703 /* \OOO (octal) escapes */
2704 case '0': case '1': case '2': case '3':
2705 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002706 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002707 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002708 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002709 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002710 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002712 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 break;
2714
Fredrik Lundhccc74732001-02-18 22:13:49 +00002715 /* hex escapes */
2716 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002718 digits = 2;
2719 message = "truncated \\xXX escape";
2720 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721
Fredrik Lundhccc74732001-02-18 22:13:49 +00002722 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002724 digits = 4;
2725 message = "truncated \\uXXXX escape";
2726 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727
Fredrik Lundhccc74732001-02-18 22:13:49 +00002728 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002729 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002730 digits = 8;
2731 message = "truncated \\UXXXXXXXX escape";
2732 hexescape:
2733 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 outpos = p-PyUnicode_AS_UNICODE(v);
2735 if (s+digits>end) {
2736 endinpos = size;
2737 if (unicode_decode_call_errorhandler(
2738 errors, &errorHandler,
2739 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002740 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 (PyObject **)&v, &outpos, &p))
2742 goto onError;
2743 goto nextByte;
2744 }
2745 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002747 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 endinpos = (s+i+1)-starts;
2749 if (unicode_decode_call_errorhandler(
2750 errors, &errorHandler,
2751 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002752 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002754 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002756 }
2757 chr = (chr<<4) & ~0xF;
2758 if (c >= '0' && c <= '9')
2759 chr += c - '0';
2760 else if (c >= 'a' && c <= 'f')
2761 chr += 10 + c - 'a';
2762 else
2763 chr += 10 + c - 'A';
2764 }
2765 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002766 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 /* _decoding_error will have already written into the
2768 target buffer. */
2769 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002770 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002771 /* when we get here, chr is a 32-bit unicode character */
2772 if (chr <= 0xffff)
2773 /* UCS-2 character */
2774 *p++ = (Py_UNICODE) chr;
2775 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002776 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002777 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002778#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002779 *p++ = chr;
2780#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002781 chr -= 0x10000L;
2782 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002783 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002784#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002785 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 endinpos = s-starts;
2787 outpos = p-PyUnicode_AS_UNICODE(v);
2788 if (unicode_decode_call_errorhandler(
2789 errors, &errorHandler,
2790 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002791 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002793 goto onError;
2794 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 break;
2796
2797 /* \N{name} */
2798 case 'N':
2799 message = "malformed \\N character escape";
2800 if (ucnhash_CAPI == NULL) {
2801 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002802 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002803 m = PyImport_ImportModule("unicodedata");
2804 if (m == NULL)
2805 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002806 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002807 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002808 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002809 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002810 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002811 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002812 if (ucnhash_CAPI == NULL)
2813 goto ucnhashError;
2814 }
2815 if (*s == '{') {
2816 const char *start = s+1;
2817 /* look for the closing brace */
2818 while (*s != '}' && s < end)
2819 s++;
2820 if (s > start && s < end && *s == '}') {
2821 /* found a name. look it up in the unicode database */
2822 message = "unknown Unicode character name";
2823 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002824 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002825 goto store;
2826 }
2827 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002828 endinpos = s-starts;
2829 outpos = p-PyUnicode_AS_UNICODE(v);
2830 if (unicode_decode_call_errorhandler(
2831 errors, &errorHandler,
2832 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002833 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002835 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002836 break;
2837
2838 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002839 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840 message = "\\ at end of string";
2841 s--;
2842 endinpos = s-starts;
2843 outpos = p-PyUnicode_AS_UNICODE(v);
2844 if (unicode_decode_call_errorhandler(
2845 errors, &errorHandler,
2846 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002847 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002849 goto onError;
2850 }
2851 else {
2852 *p++ = '\\';
2853 *p++ = (unsigned char)s[-1];
2854 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002855 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 nextByte:
2858 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002860 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002862 Py_XDECREF(errorHandler);
2863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002865
Fredrik Lundhccc74732001-02-18 22:13:49 +00002866ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002867 PyErr_SetString(
2868 PyExc_UnicodeError,
2869 "\\N escapes not supported (can't load unicodedata module)"
2870 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002871 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 Py_XDECREF(errorHandler);
2873 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002874 return NULL;
2875
Fredrik Lundhccc74732001-02-18 22:13:49 +00002876onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878 Py_XDECREF(errorHandler);
2879 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 return NULL;
2881}
2882
2883/* Return a Unicode-Escape string version of the Unicode object.
2884
2885 If quotes is true, the string is enclosed in u"" or u'' quotes as
2886 appropriate.
2887
2888*/
2889
Thomas Wouters477c8d52006-05-27 19:21:47 +00002890Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2891 Py_ssize_t size,
2892 Py_UNICODE ch)
2893{
2894 /* like wcschr, but doesn't stop at NULL characters */
2895
2896 while (size-- > 0) {
2897 if (*s == ch)
2898 return s;
2899 s++;
2900 }
2901
2902 return NULL;
2903}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002904
Walter Dörwald79e913e2007-05-12 11:08:06 +00002905static const char *hexdigits = "0123456789abcdef";
2906
2907PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2908 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002910 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912
Thomas Wouters89f507f2006-12-13 04:49:30 +00002913 /* XXX(nnorwitz): rather than over-allocating, it would be
2914 better to choose a different scheme. Perhaps scan the
2915 first N-chars of the string and allocate based on that size.
2916 */
2917 /* Initial allocation is based on the longest-possible unichr
2918 escape.
2919
2920 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2921 unichr, so in this case it's the longest unichr escape. In
2922 narrow (UTF-16) builds this is five chars per source unichr
2923 since there are two unichrs in the surrogate pair, so in narrow
2924 (UTF-16) builds it's not the longest unichr escape.
2925
2926 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2927 so in the narrow (UTF-16) build case it's the longest unichr
2928 escape.
2929 */
2930
Walter Dörwald79e913e2007-05-12 11:08:06 +00002931 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002932#ifdef Py_UNICODE_WIDE
2933 + 10*size
2934#else
2935 + 6*size
2936#endif
2937 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 if (repr == NULL)
2939 return NULL;
2940
Walter Dörwald79e913e2007-05-12 11:08:06 +00002941 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 while (size-- > 0) {
2944 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002945
Walter Dörwald79e913e2007-05-12 11:08:06 +00002946 /* Escape backslashes */
2947 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 *p++ = '\\';
2949 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002950 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002951 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002952
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002953#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002954 /* Map 21-bit characters to '\U00xxxxxx' */
2955 else if (ch >= 0x10000) {
2956 *p++ = '\\';
2957 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002958 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2959 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2960 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2961 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2962 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2963 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2964 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2965 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002966 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002967 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002968#else
2969 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002970 else if (ch >= 0xD800 && ch < 0xDC00) {
2971 Py_UNICODE ch2;
2972 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002973
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002974 ch2 = *s++;
2975 size--;
2976 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2977 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2978 *p++ = '\\';
2979 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002980 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2981 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2982 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2983 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2984 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2985 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2986 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2987 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002988 continue;
2989 }
2990 /* Fall through: isolated surrogates are copied as-is */
2991 s--;
2992 size++;
2993 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002994#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002995
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002997 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 *p++ = '\\';
2999 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003000 *p++ = hexdigits[(ch >> 12) & 0x000F];
3001 *p++ = hexdigits[(ch >> 8) & 0x000F];
3002 *p++ = hexdigits[(ch >> 4) & 0x000F];
3003 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003005
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003006 /* Map special whitespace to '\t', \n', '\r' */
3007 else if (ch == '\t') {
3008 *p++ = '\\';
3009 *p++ = 't';
3010 }
3011 else if (ch == '\n') {
3012 *p++ = '\\';
3013 *p++ = 'n';
3014 }
3015 else if (ch == '\r') {
3016 *p++ = '\\';
3017 *p++ = 'r';
3018 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003019
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003020 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003021 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003023 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003024 *p++ = hexdigits[(ch >> 4) & 0x000F];
3025 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003026 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003027
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 /* Copy everything else as-is */
3029 else
3030 *p++ = (char) ch;
3031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Guido van Rossum98297ee2007-11-06 21:34:58 +00003033 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3034 p - PyBytes_AS_STRING(repr));
3035 Py_DECREF(repr);
3036 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037}
3038
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3040{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003041 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 if (!PyUnicode_Check(unicode)) {
3043 PyErr_BadArgument();
3044 return NULL;
3045 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003046 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3047 PyUnicode_GET_SIZE(unicode));
3048
3049 if (!s)
3050 return NULL;
3051 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3052 PyBytes_GET_SIZE(s));
3053 Py_DECREF(s);
3054 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055}
3056
3057/* --- Raw Unicode Escape Codec ------------------------------------------- */
3058
3059PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 const char *errors)
3062{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003064 Py_ssize_t startinpos;
3065 Py_ssize_t endinpos;
3066 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 const char *end;
3070 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 PyObject *errorHandler = NULL;
3072 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003073
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 /* Escaped strings will always be longer than the resulting
3075 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 length after conversion to the true value. (But decoding error
3077 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 v = _PyUnicode_New(size);
3079 if (v == NULL)
3080 goto onError;
3081 if (size == 0)
3082 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 end = s + size;
3085 while (s < end) {
3086 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003087 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003089 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090
3091 /* Non-escape characters are interpreted as Unicode ordinals */
3092 if (*s != '\\') {
3093 *p++ = (unsigned char)*s++;
3094 continue;
3095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003096 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097
3098 /* \u-escapes are only interpreted iff the number of leading
3099 backslashes if odd */
3100 bs = s;
3101 for (;s < end;) {
3102 if (*s != '\\')
3103 break;
3104 *p++ = (unsigned char)*s++;
3105 }
3106 if (((s - bs) & 1) == 0 ||
3107 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003108 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 continue;
3110 }
3111 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003112 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 s++;
3114
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003115 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003117 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003119 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 endinpos = s-starts;
3121 if (unicode_decode_call_errorhandler(
3122 errors, &errorHandler,
3123 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003124 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003125 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 }
3129 x = (x<<4) & ~0xF;
3130 if (c >= '0' && c <= '9')
3131 x += c - '0';
3132 else if (c >= 'a' && c <= 'f')
3133 x += 10 + c - 'a';
3134 else
3135 x += 10 + c - 'A';
3136 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003137#ifndef Py_UNICODE_WIDE
3138 if (x > 0x10000) {
3139 if (unicode_decode_call_errorhandler(
3140 errors, &errorHandler,
3141 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003142 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003143 (PyObject **)&v, &outpos, &p))
3144 goto onError;
3145 }
3146#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 *p++ = x;
3148 nextByte:
3149 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003151 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003152 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003153 Py_XDECREF(errorHandler);
3154 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003156
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 onError:
3158 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 Py_XDECREF(errorHandler);
3160 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 return NULL;
3162}
3163
3164PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003165 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003167 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 char *p;
3169 char *q;
3170
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003171#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003172 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003173#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003174 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003175#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 if (repr == NULL)
3177 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003178 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003179 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180
Walter Dörwald711005d2007-05-12 12:03:26 +00003181 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 while (size-- > 0) {
3183 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003184#ifdef Py_UNICODE_WIDE
3185 /* Map 32-bit characters to '\Uxxxxxxxx' */
3186 if (ch >= 0x10000) {
3187 *p++ = '\\';
3188 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003189 *p++ = hexdigits[(ch >> 28) & 0xf];
3190 *p++ = hexdigits[(ch >> 24) & 0xf];
3191 *p++ = hexdigits[(ch >> 20) & 0xf];
3192 *p++ = hexdigits[(ch >> 16) & 0xf];
3193 *p++ = hexdigits[(ch >> 12) & 0xf];
3194 *p++ = hexdigits[(ch >> 8) & 0xf];
3195 *p++ = hexdigits[(ch >> 4) & 0xf];
3196 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003197 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003198 else
3199#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 /* Map 16-bit characters to '\uxxxx' */
3201 if (ch >= 256) {
3202 *p++ = '\\';
3203 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003204 *p++ = hexdigits[(ch >> 12) & 0xf];
3205 *p++ = hexdigits[(ch >> 8) & 0xf];
3206 *p++ = hexdigits[(ch >> 4) & 0xf];
3207 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 }
3209 /* Copy everything else as-is */
3210 else
3211 *p++ = (char) ch;
3212 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003213 size = p - q;
3214
3215 done:
3216 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3217 Py_DECREF(repr);
3218 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219}
3220
3221PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3222{
Walter Dörwald711005d2007-05-12 12:03:26 +00003223 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003225 PyErr_BadArgument();
3226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003228 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3229 PyUnicode_GET_SIZE(unicode));
3230
3231 if (!s)
3232 return NULL;
3233 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3234 PyBytes_GET_SIZE(s));
3235 Py_DECREF(s);
3236 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237}
3238
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003239/* --- Unicode Internal Codec ------------------------------------------- */
3240
3241PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003242 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003243 const char *errors)
3244{
3245 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003246 Py_ssize_t startinpos;
3247 Py_ssize_t endinpos;
3248 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003249 PyUnicodeObject *v;
3250 Py_UNICODE *p;
3251 const char *end;
3252 const char *reason;
3253 PyObject *errorHandler = NULL;
3254 PyObject *exc = NULL;
3255
Neal Norwitzd43069c2006-01-08 01:12:10 +00003256#ifdef Py_UNICODE_WIDE
3257 Py_UNICODE unimax = PyUnicode_GetMax();
3258#endif
3259
Thomas Wouters89f507f2006-12-13 04:49:30 +00003260 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003261 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3262 if (v == NULL)
3263 goto onError;
3264 if (PyUnicode_GetSize((PyObject *)v) == 0)
3265 return (PyObject *)v;
3266 p = PyUnicode_AS_UNICODE(v);
3267 end = s + size;
3268
3269 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003270 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003271 /* We have to sanity check the raw data, otherwise doom looms for
3272 some malformed UCS-4 data. */
3273 if (
3274 #ifdef Py_UNICODE_WIDE
3275 *p > unimax || *p < 0 ||
3276 #endif
3277 end-s < Py_UNICODE_SIZE
3278 )
3279 {
3280 startinpos = s - starts;
3281 if (end-s < Py_UNICODE_SIZE) {
3282 endinpos = end-starts;
3283 reason = "truncated input";
3284 }
3285 else {
3286 endinpos = s - starts + Py_UNICODE_SIZE;
3287 reason = "illegal code point (> 0x10FFFF)";
3288 }
3289 outpos = p - PyUnicode_AS_UNICODE(v);
3290 if (unicode_decode_call_errorhandler(
3291 errors, &errorHandler,
3292 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003293 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003294 (PyObject **)&v, &outpos, &p)) {
3295 goto onError;
3296 }
3297 }
3298 else {
3299 p++;
3300 s += Py_UNICODE_SIZE;
3301 }
3302 }
3303
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003304 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003305 goto onError;
3306 Py_XDECREF(errorHandler);
3307 Py_XDECREF(exc);
3308 return (PyObject *)v;
3309
3310 onError:
3311 Py_XDECREF(v);
3312 Py_XDECREF(errorHandler);
3313 Py_XDECREF(exc);
3314 return NULL;
3315}
3316
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317/* --- Latin-1 Codec ------------------------------------------------------ */
3318
3319PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003320 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 const char *errors)
3322{
3323 PyUnicodeObject *v;
3324 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003325
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003327 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003328 Py_UNICODE r = *(unsigned char*)s;
3329 return PyUnicode_FromUnicode(&r, 1);
3330 }
3331
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 v = _PyUnicode_New(size);
3333 if (v == NULL)
3334 goto onError;
3335 if (size == 0)
3336 return (PyObject *)v;
3337 p = PyUnicode_AS_UNICODE(v);
3338 while (size-- > 0)
3339 *p++ = (unsigned char)*s++;
3340 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003341
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 onError:
3343 Py_XDECREF(v);
3344 return NULL;
3345}
3346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347/* create or adjust a UnicodeEncodeError */
3348static void make_encode_exception(PyObject **exceptionObject,
3349 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003350 const Py_UNICODE *unicode, Py_ssize_t size,
3351 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354 if (*exceptionObject == NULL) {
3355 *exceptionObject = PyUnicodeEncodeError_Create(
3356 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 }
3358 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3360 goto onError;
3361 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3362 goto onError;
3363 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3364 goto onError;
3365 return;
3366 onError:
3367 Py_DECREF(*exceptionObject);
3368 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 }
3370}
3371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372/* raises a UnicodeEncodeError */
3373static void raise_encode_exception(PyObject **exceptionObject,
3374 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 const Py_UNICODE *unicode, Py_ssize_t size,
3376 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377 const char *reason)
3378{
3379 make_encode_exception(exceptionObject,
3380 encoding, unicode, size, startpos, endpos, reason);
3381 if (*exceptionObject != NULL)
3382 PyCodec_StrictErrors(*exceptionObject);
3383}
3384
3385/* error handling callback helper:
3386 build arguments, call the callback and check the arguments,
3387 put the result into newpos and return the replacement string, which
3388 has to be freed by the caller */
3389static PyObject *unicode_encode_call_errorhandler(const char *errors,
3390 PyObject **errorHandler,
3391 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003392 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3393 Py_ssize_t startpos, Py_ssize_t endpos,
3394 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003395{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003396 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397
3398 PyObject *restuple;
3399 PyObject *resunicode;
3400
3401 if (*errorHandler == NULL) {
3402 *errorHandler = PyCodec_LookupError(errors);
3403 if (*errorHandler == NULL)
3404 return NULL;
3405 }
3406
3407 make_encode_exception(exceptionObject,
3408 encoding, unicode, size, startpos, endpos, reason);
3409 if (*exceptionObject == NULL)
3410 return NULL;
3411
3412 restuple = PyObject_CallFunctionObjArgs(
3413 *errorHandler, *exceptionObject, NULL);
3414 if (restuple == NULL)
3415 return NULL;
3416 if (!PyTuple_Check(restuple)) {
3417 PyErr_Format(PyExc_TypeError, &argparse[4]);
3418 Py_DECREF(restuple);
3419 return NULL;
3420 }
3421 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3422 &resunicode, newpos)) {
3423 Py_DECREF(restuple);
3424 return NULL;
3425 }
3426 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003427 *newpos = size+*newpos;
3428 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003429 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003430 Py_DECREF(restuple);
3431 return NULL;
3432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433 Py_INCREF(resunicode);
3434 Py_DECREF(restuple);
3435 return resunicode;
3436}
3437
3438static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003439 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 const char *errors,
3441 int limit)
3442{
3443 /* output object */
3444 PyObject *res;
3445 /* pointers to the beginning and end+1 of input */
3446 const Py_UNICODE *startp = p;
3447 const Py_UNICODE *endp = p + size;
3448 /* pointer to the beginning of the unencodable characters */
3449 /* const Py_UNICODE *badp = NULL; */
3450 /* pointer into the output */
3451 char *str;
3452 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003453 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003454 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3455 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 PyObject *errorHandler = NULL;
3457 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003458 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 /* the following variable is used for caching string comparisons
3460 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3461 int known_errorHandler = -1;
3462
3463 /* allocate enough for a simple encoding without
3464 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003465 if (size == 0)
3466 return PyString_FromStringAndSize(NULL, 0);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003467 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003469 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003470 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 ressize = size;
3472
3473 while (p<endp) {
3474 Py_UNICODE c = *p;
3475
3476 /* can we encode this? */
3477 if (c<limit) {
3478 /* no overflow check, because we know that the space is enough */
3479 *str++ = (char)c;
3480 ++p;
3481 }
3482 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003483 Py_ssize_t unicodepos = p-startp;
3484 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003486 Py_ssize_t repsize;
3487 Py_ssize_t newpos;
3488 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 Py_UNICODE *uni2;
3490 /* startpos for collecting unencodable chars */
3491 const Py_UNICODE *collstart = p;
3492 const Py_UNICODE *collend = p;
3493 /* find all unecodable characters */
3494 while ((collend < endp) && ((*collend)>=limit))
3495 ++collend;
3496 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3497 if (known_errorHandler==-1) {
3498 if ((errors==NULL) || (!strcmp(errors, "strict")))
3499 known_errorHandler = 1;
3500 else if (!strcmp(errors, "replace"))
3501 known_errorHandler = 2;
3502 else if (!strcmp(errors, "ignore"))
3503 known_errorHandler = 3;
3504 else if (!strcmp(errors, "xmlcharrefreplace"))
3505 known_errorHandler = 4;
3506 else
3507 known_errorHandler = 0;
3508 }
3509 switch (known_errorHandler) {
3510 case 1: /* strict */
3511 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3512 goto onError;
3513 case 2: /* replace */
3514 while (collstart++<collend)
3515 *str++ = '?'; /* fall through */
3516 case 3: /* ignore */
3517 p = collend;
3518 break;
3519 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003520 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 /* determine replacement size (temporarily (mis)uses p) */
3522 for (p = collstart, repsize = 0; p < collend; ++p) {
3523 if (*p<10)
3524 repsize += 2+1+1;
3525 else if (*p<100)
3526 repsize += 2+2+1;
3527 else if (*p<1000)
3528 repsize += 2+3+1;
3529 else if (*p<10000)
3530 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003531#ifndef Py_UNICODE_WIDE
3532 else
3533 repsize += 2+5+1;
3534#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 else if (*p<100000)
3536 repsize += 2+5+1;
3537 else if (*p<1000000)
3538 repsize += 2+6+1;
3539 else
3540 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003541#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 }
3543 requiredsize = respos+repsize+(endp-collend);
3544 if (requiredsize > ressize) {
3545 if (requiredsize<2*ressize)
3546 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003547 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003549 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 ressize = requiredsize;
3551 }
3552 /* generate replacement (temporarily (mis)uses p) */
3553 for (p = collstart; p < collend; ++p) {
3554 str += sprintf(str, "&#%d;", (int)*p);
3555 }
3556 p = collend;
3557 break;
3558 default:
3559 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3560 encoding, reason, startp, size, &exc,
3561 collstart-startp, collend-startp, &newpos);
3562 if (repunicode == NULL)
3563 goto onError;
3564 /* need more space? (at least enough for what we
3565 have+the replacement+the rest of the string, so
3566 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003567 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 repsize = PyUnicode_GET_SIZE(repunicode);
3569 requiredsize = respos+repsize+(endp-collend);
3570 if (requiredsize > ressize) {
3571 if (requiredsize<2*ressize)
3572 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003573 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_DECREF(repunicode);
3575 goto onError;
3576 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003577 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 ressize = requiredsize;
3579 }
3580 /* check if there is anything unencodable in the replacement
3581 and copy it to the output */
3582 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3583 c = *uni2;
3584 if (c >= limit) {
3585 raise_encode_exception(&exc, encoding, startp, size,
3586 unicodepos, unicodepos+1, reason);
3587 Py_DECREF(repunicode);
3588 goto onError;
3589 }
3590 *str = (char)c;
3591 }
3592 p = startp + newpos;
3593 Py_DECREF(repunicode);
3594 }
3595 }
3596 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003597 result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
3598 str - PyBytes_AS_STRING(res));
3599 onError:
3600 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 Py_XDECREF(errorHandler);
3602 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003603 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604}
3605
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003607 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 const char *errors)
3609{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611}
3612
3613PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3614{
3615 if (!PyUnicode_Check(unicode)) {
3616 PyErr_BadArgument();
3617 return NULL;
3618 }
3619 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3620 PyUnicode_GET_SIZE(unicode),
3621 NULL);
3622}
3623
3624/* --- 7-bit ASCII Codec -------------------------------------------------- */
3625
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003627 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 const char *errors)
3629{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 PyUnicodeObject *v;
3632 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003633 Py_ssize_t startinpos;
3634 Py_ssize_t endinpos;
3635 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 const char *e;
3637 PyObject *errorHandler = NULL;
3638 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003639
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003641 if (size == 1 && *(unsigned char*)s < 128) {
3642 Py_UNICODE r = *(unsigned char*)s;
3643 return PyUnicode_FromUnicode(&r, 1);
3644 }
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 v = _PyUnicode_New(size);
3647 if (v == NULL)
3648 goto onError;
3649 if (size == 0)
3650 return (PyObject *)v;
3651 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 e = s + size;
3653 while (s < e) {
3654 register unsigned char c = (unsigned char)*s;
3655 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 ++s;
3658 }
3659 else {
3660 startinpos = s-starts;
3661 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003662 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 if (unicode_decode_call_errorhandler(
3664 errors, &errorHandler,
3665 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003666 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003671 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003672 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003673 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 Py_XDECREF(errorHandler);
3675 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003677
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 onError:
3679 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 Py_XDECREF(errorHandler);
3681 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 return NULL;
3683}
3684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003686 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 const char *errors)
3688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690}
3691
3692PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3693{
3694 if (!PyUnicode_Check(unicode)) {
3695 PyErr_BadArgument();
3696 return NULL;
3697 }
3698 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3699 PyUnicode_GET_SIZE(unicode),
3700 NULL);
3701}
3702
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003703#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003704
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003705/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003706
/* NEED_RETRY is defined when SIZEOF_INT is smaller than SIZEOF_SSIZE_T
   (presumably the byte sizes of int and Py_ssize_t -- confirm against the
   configuration header).  PyUnicode_DecodeMBCSStateful() tests it and
   then hands inputs longer than INT_MAX to decode_mbcs(), whose size
   parameter is an int, in pieces of at most INT_MAX bytes. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003707#if SIZEOF_INT < SIZEOF_SSIZE_T
3708#define NEED_RETRY
3709#endif
3710
3711/* XXX This code is limited to "true" double-byte encodings, as
3712 a) it assumes an incomplete character consists of a single byte, and
3713 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3714 encodings, see IsDBCSLeadByteEx documentation. */
3715
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.

   IsDBCSLeadByte() alone is not enough, so the character before the
   candidate (as located by CharPrev()) is examined as well.  The byte is
   accepted when there is no previous character, when the previous
   character does not start with a lead byte, or when the previous
   character is two bytes long.
   NOTE(review): the reading of CharPrev()'s result is inferred from how
   it is used here -- confirm against the Win32 documentation. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return candidate - before == 2;
}
3726
3727/*
3728 * Decode MBCS string into unicode object. If 'final' is set, converts
3729 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3730 */
3731static int decode_mbcs(PyUnicodeObject **v,
3732 const char *s, /* MBCS string */
3733 int size, /* sizeof MBCS string */
3734 int final)
3735{
3736 Py_UNICODE *p;
3737 Py_ssize_t n = 0;
3738 int usize = 0;
3739
3740 assert(size >= 0);
3741
3742 /* Skip trailing lead-byte unless 'final' is set */
3743 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3744 --size;
3745
3746 /* First get the size of the result */
3747 if (size > 0) {
3748 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3749 if (usize == 0) {
3750 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3751 return -1;
3752 }
3753 }
3754
3755 if (*v == NULL) {
3756 /* Create unicode object */
3757 *v = _PyUnicode_New(usize);
3758 if (*v == NULL)
3759 return -1;
3760 }
3761 else {
3762 /* Extend unicode object */
3763 n = PyUnicode_GET_SIZE(*v);
3764 if (_PyUnicode_Resize(v, n + usize) < 0)
3765 return -1;
3766 }
3767
3768 /* Do the conversion */
3769 if (size > 0) {
3770 p = PyUnicode_AS_UNICODE(*v) + n;
3771 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3772 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3773 return -1;
3774 }
3775 }
3776
3777 return size;
3778}
3779
3780PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3781 Py_ssize_t size,
3782 const char *errors,
3783 Py_ssize_t *consumed)
3784{
3785 PyUnicodeObject *v = NULL;
3786 int done;
3787
3788 if (consumed)
3789 *consumed = 0;
3790
3791#ifdef NEED_RETRY
3792 retry:
3793 if (size > INT_MAX)
3794 done = decode_mbcs(&v, s, INT_MAX, 0);
3795 else
3796#endif
3797 done = decode_mbcs(&v, s, (int)size, !consumed);
3798
3799 if (done < 0) {
3800 Py_XDECREF(v);
3801 return NULL;
3802 }
3803
3804 if (consumed)
3805 *consumed += done;
3806
3807#ifdef NEED_RETRY
3808 if (size > INT_MAX) {
3809 s += done;
3810 size -= done;
3811 goto retry;
3812 }
3813#endif
3814
3815 return (PyObject *)v;
3816}
3817
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003818PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003819 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003820 const char *errors)
3821{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003822 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3823}
3824
3825/*
3826 * Convert unicode into string object (MBCS).
3827 * Returns 0 if succeed, -1 otherwise.
3828 */
3829static int encode_mbcs(PyObject **repr,
3830 const Py_UNICODE *p, /* unicode */
3831 int size) /* size of unicode */
3832{
3833 int mbcssize = 0;
3834 Py_ssize_t n = 0;
3835
3836 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003837
3838 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003839 if (size > 0) {
3840 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3841 if (mbcssize == 0) {
3842 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3843 return -1;
3844 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003845 }
3846
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003847 if (*repr == NULL) {
3848 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003849 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003850 if (*repr == NULL)
3851 return -1;
3852 }
3853 else {
3854 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003855 n = PyString_Size(*repr);
3856 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003857 return -1;
3858 }
3859
3860 /* Do the conversion */
3861 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003862 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003863 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3864 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3865 return -1;
3866 }
3867 }
3868
3869 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003870}
3871
3872PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003873 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003874 const char *errors)
3875{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003876 PyObject *repr = NULL;
3877 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003878
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003879#ifdef NEED_RETRY
3880 retry:
3881 if (size > INT_MAX)
3882 ret = encode_mbcs(&repr, p, INT_MAX);
3883 else
3884#endif
3885 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003886
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003887 if (ret < 0) {
3888 Py_XDECREF(repr);
3889 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003890 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003891
3892#ifdef NEED_RETRY
3893 if (size > INT_MAX) {
3894 p += INT_MAX;
3895 size -= INT_MAX;
3896 goto retry;
3897 }
3898#endif
3899
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003900 return repr;
3901}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003902
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003903PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3904{
3905 if (!PyUnicode_Check(unicode)) {
3906 PyErr_BadArgument();
3907 return NULL;
3908 }
3909 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3910 PyUnicode_GET_SIZE(unicode),
3911 NULL);
3912}
3913
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003914#undef NEED_RETRY
3915
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003916#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003917
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918/* --- Character Mapping Codec -------------------------------------------- */
3919
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003921 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922 PyObject *mapping,
3923 const char *errors)
3924{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003926 Py_ssize_t startinpos;
3927 Py_ssize_t endinpos;
3928 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 PyUnicodeObject *v;
3931 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 PyObject *errorHandler = NULL;
3934 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003935 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003936 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003937
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 /* Default to Latin-1 */
3939 if (mapping == NULL)
3940 return PyUnicode_DecodeLatin1(s, size, errors);
3941
3942 v = _PyUnicode_New(size);
3943 if (v == NULL)
3944 goto onError;
3945 if (size == 0)
3946 return (PyObject *)v;
3947 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003949 if (PyUnicode_CheckExact(mapping)) {
3950 mapstring = PyUnicode_AS_UNICODE(mapping);
3951 maplen = PyUnicode_GET_SIZE(mapping);
3952 while (s < e) {
3953 unsigned char ch = *s;
3954 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003956 if (ch < maplen)
3957 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003959 if (x == 0xfffe) {
3960 /* undefined mapping */
3961 outpos = p-PyUnicode_AS_UNICODE(v);
3962 startinpos = s-starts;
3963 endinpos = startinpos+1;
3964 if (unicode_decode_call_errorhandler(
3965 errors, &errorHandler,
3966 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003967 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003968 (PyObject **)&v, &outpos, &p)) {
3969 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003970 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003971 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003972 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003973 *p++ = x;
3974 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003976 }
3977 else {
3978 while (s < e) {
3979 unsigned char ch = *s;
3980 PyObject *w, *x;
3981
3982 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3983 w = PyInt_FromLong((long)ch);
3984 if (w == NULL)
3985 goto onError;
3986 x = PyObject_GetItem(mapping, w);
3987 Py_DECREF(w);
3988 if (x == NULL) {
3989 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3990 /* No mapping found means: mapping is undefined. */
3991 PyErr_Clear();
3992 x = Py_None;
3993 Py_INCREF(x);
3994 } else
3995 goto onError;
3996 }
3997
3998 /* Apply mapping */
3999 if (PyInt_Check(x)) {
4000 long value = PyInt_AS_LONG(x);
4001 if (value < 0 || value > 65535) {
4002 PyErr_SetString(PyExc_TypeError,
4003 "character mapping must be in range(65536)");
4004 Py_DECREF(x);
4005 goto onError;
4006 }
4007 *p++ = (Py_UNICODE)value;
4008 }
4009 else if (x == Py_None) {
4010 /* undefined mapping */
4011 outpos = p-PyUnicode_AS_UNICODE(v);
4012 startinpos = s-starts;
4013 endinpos = startinpos+1;
4014 if (unicode_decode_call_errorhandler(
4015 errors, &errorHandler,
4016 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004017 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004018 (PyObject **)&v, &outpos, &p)) {
4019 Py_DECREF(x);
4020 goto onError;
4021 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004022 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004023 continue;
4024 }
4025 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004026 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004027
4028 if (targetsize == 1)
4029 /* 1-1 mapping */
4030 *p++ = *PyUnicode_AS_UNICODE(x);
4031
4032 else if (targetsize > 1) {
4033 /* 1-n mapping */
4034 if (targetsize > extrachars) {
4035 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004036 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4037 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004038 (targetsize << 2);
4039 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004040 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004041 if (_PyUnicode_Resize(&v,
4042 PyUnicode_GET_SIZE(v) + needed) < 0) {
4043 Py_DECREF(x);
4044 goto onError;
4045 }
4046 p = PyUnicode_AS_UNICODE(v) + oldpos;
4047 }
4048 Py_UNICODE_COPY(p,
4049 PyUnicode_AS_UNICODE(x),
4050 targetsize);
4051 p += targetsize;
4052 extrachars -= targetsize;
4053 }
4054 /* 1-0 mapping: skip the character */
4055 }
4056 else {
4057 /* wrong return value */
4058 PyErr_SetString(PyExc_TypeError,
4059 "character mapping must return integer, None or unicode");
4060 Py_DECREF(x);
4061 goto onError;
4062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004064 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 }
4067 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004068 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 Py_XDECREF(errorHandler);
4071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004073
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 Py_XDECREF(errorHandler);
4076 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 Py_XDECREF(v);
4078 return NULL;
4079}
4080
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004081/* Charmap encoding: the lookup table */
4082
4083struct encoding_map{
4084 PyObject_HEAD
4085 unsigned char level1[32];
4086 int count2, count3;
4087 unsigned char level23[1];
4088};
4089
4090static PyObject*
4091encoding_map_size(PyObject *obj, PyObject* args)
4092{
4093 struct encoding_map *map = (struct encoding_map*)obj;
4094 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4095 128*map->count3);
4096}
4097
4098static PyMethodDef encoding_map_methods[] = {
4099 {"size", encoding_map_size, METH_NOARGS,
4100 PyDoc_STR("Return the size (in bytes) of this object") },
4101 { 0 }
4102};
4103
4104static void
4105encoding_map_dealloc(PyObject* o)
4106{
4107 PyObject_FREE(o);
4108}
4109
4110static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004111 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004112 "EncodingMap", /*tp_name*/
4113 sizeof(struct encoding_map), /*tp_basicsize*/
4114 0, /*tp_itemsize*/
4115 /* methods */
4116 encoding_map_dealloc, /*tp_dealloc*/
4117 0, /*tp_print*/
4118 0, /*tp_getattr*/
4119 0, /*tp_setattr*/
4120 0, /*tp_compare*/
4121 0, /*tp_repr*/
4122 0, /*tp_as_number*/
4123 0, /*tp_as_sequence*/
4124 0, /*tp_as_mapping*/
4125 0, /*tp_hash*/
4126 0, /*tp_call*/
4127 0, /*tp_str*/
4128 0, /*tp_getattro*/
4129 0, /*tp_setattro*/
4130 0, /*tp_as_buffer*/
4131 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4132 0, /*tp_doc*/
4133 0, /*tp_traverse*/
4134 0, /*tp_clear*/
4135 0, /*tp_richcompare*/
4136 0, /*tp_weaklistoffset*/
4137 0, /*tp_iter*/
4138 0, /*tp_iternext*/
4139 encoding_map_methods, /*tp_methods*/
4140 0, /*tp_members*/
4141 0, /*tp_getset*/
4142 0, /*tp_base*/
4143 0, /*tp_dict*/
4144 0, /*tp_descr_get*/
4145 0, /*tp_descr_set*/
4146 0, /*tp_dictoffset*/
4147 0, /*tp_init*/
4148 0, /*tp_alloc*/
4149 0, /*tp_new*/
4150 0, /*tp_free*/
4151 0, /*tp_is_gc*/
4152};
4153
4154PyObject*
4155PyUnicode_BuildEncodingMap(PyObject* string)
4156{
4157 Py_UNICODE *decode;
4158 PyObject *result;
4159 struct encoding_map *mresult;
4160 int i;
4161 int need_dict = 0;
4162 unsigned char level1[32];
4163 unsigned char level2[512];
4164 unsigned char *mlevel1, *mlevel2, *mlevel3;
4165 int count2 = 0, count3 = 0;
4166
4167 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4168 PyErr_BadArgument();
4169 return NULL;
4170 }
4171 decode = PyUnicode_AS_UNICODE(string);
4172 memset(level1, 0xFF, sizeof level1);
4173 memset(level2, 0xFF, sizeof level2);
4174
4175 /* If there isn't a one-to-one mapping of NULL to \0,
4176 or if there are non-BMP characters, we need to use
4177 a mapping dictionary. */
4178 if (decode[0] != 0)
4179 need_dict = 1;
4180 for (i = 1; i < 256; i++) {
4181 int l1, l2;
4182 if (decode[i] == 0
4183 #ifdef Py_UNICODE_WIDE
4184 || decode[i] > 0xFFFF
4185 #endif
4186 ) {
4187 need_dict = 1;
4188 break;
4189 }
4190 if (decode[i] == 0xFFFE)
4191 /* unmapped character */
4192 continue;
4193 l1 = decode[i] >> 11;
4194 l2 = decode[i] >> 7;
4195 if (level1[l1] == 0xFF)
4196 level1[l1] = count2++;
4197 if (level2[l2] == 0xFF)
4198 level2[l2] = count3++;
4199 }
4200
4201 if (count2 >= 0xFF || count3 >= 0xFF)
4202 need_dict = 1;
4203
4204 if (need_dict) {
4205 PyObject *result = PyDict_New();
4206 PyObject *key, *value;
4207 if (!result)
4208 return NULL;
4209 for (i = 0; i < 256; i++) {
4210 key = value = NULL;
4211 key = PyInt_FromLong(decode[i]);
4212 value = PyInt_FromLong(i);
4213 if (!key || !value)
4214 goto failed1;
4215 if (PyDict_SetItem(result, key, value) == -1)
4216 goto failed1;
4217 Py_DECREF(key);
4218 Py_DECREF(value);
4219 }
4220 return result;
4221 failed1:
4222 Py_XDECREF(key);
4223 Py_XDECREF(value);
4224 Py_DECREF(result);
4225 return NULL;
4226 }
4227
4228 /* Create a three-level trie */
4229 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4230 16*count2 + 128*count3 - 1);
4231 if (!result)
4232 return PyErr_NoMemory();
4233 PyObject_Init(result, &EncodingMapType);
4234 mresult = (struct encoding_map*)result;
4235 mresult->count2 = count2;
4236 mresult->count3 = count3;
4237 mlevel1 = mresult->level1;
4238 mlevel2 = mresult->level23;
4239 mlevel3 = mresult->level23 + 16*count2;
4240 memcpy(mlevel1, level1, 32);
4241 memset(mlevel2, 0xFF, 16*count2);
4242 memset(mlevel3, 0, 128*count3);
4243 count3 = 0;
4244 for (i = 1; i < 256; i++) {
4245 int o1, o2, o3, i2, i3;
4246 if (decode[i] == 0xFFFE)
4247 /* unmapped character */
4248 continue;
4249 o1 = decode[i]>>11;
4250 o2 = (decode[i]>>7) & 0xF;
4251 i2 = 16*mlevel1[o1] + o2;
4252 if (mlevel2[i2] == 0xFF)
4253 mlevel2[i2] = count3++;
4254 o3 = decode[i] & 0x7F;
4255 i3 = 128*mlevel2[i2] + o3;
4256 mlevel3[i3] = i;
4257 }
4258 return result;
4259}
4260
4261static int
4262encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4263{
4264 struct encoding_map *map = (struct encoding_map*)mapping;
4265 int l1 = c>>11;
4266 int l2 = (c>>7) & 0xF;
4267 int l3 = c & 0x7F;
4268 int i;
4269
4270#ifdef Py_UNICODE_WIDE
4271 if (c > 0xFFFF) {
4272 return -1;
4273 }
4274#endif
4275 if (c == 0)
4276 return 0;
4277 /* level 1*/
4278 i = map->level1[l1];
4279 if (i == 0xFF) {
4280 return -1;
4281 }
4282 /* level 2*/
4283 i = map->level23[16*i+l2];
4284 if (i == 0xFF) {
4285 return -1;
4286 }
4287 /* level 3 */
4288 i = map->level23[16*map->count2 + 128*i + l3];
4289 if (i == 0) {
4290 return -1;
4291 }
4292 return i;
4293}
4294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295/* Lookup the character ch in the mapping. If the character
4296 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004297 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 PyObject *w = PyInt_FromLong((long)c);
4301 PyObject *x;
4302
4303 if (w == NULL)
4304 return NULL;
4305 x = PyObject_GetItem(mapping, w);
4306 Py_DECREF(w);
4307 if (x == NULL) {
4308 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4309 /* No mapping found means: mapping is undefined. */
4310 PyErr_Clear();
4311 x = Py_None;
4312 Py_INCREF(x);
4313 return x;
4314 } else
4315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004317 else if (x == Py_None)
4318 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 else if (PyInt_Check(x)) {
4320 long value = PyInt_AS_LONG(x);
4321 if (value < 0 || value > 255) {
4322 PyErr_SetString(PyExc_TypeError,
4323 "character mapping must be in range(256)");
4324 Py_DECREF(x);
4325 return NULL;
4326 }
4327 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 else if (PyString_Check(x))
4330 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004333 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004334 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004335 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 Py_DECREF(x);
4337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 }
4339}
4340
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004341static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004342charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004343{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004344 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004345 /* exponentially overallocate to minimize reallocations */
4346 if (requiredsize < 2*outsize)
4347 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004348 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004349 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004350 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004351}
4352
/* Outcome of trying to encode one character (see charmapencode_output) */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004357 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358 space is available. Return a new reference to the object that
4359 was put in the output buffer, or Py_None, if the mapping was undefined
4360 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004361 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004363charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004364 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004365{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004366 PyObject *rep;
4367 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004368 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004370 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004371 int res = encoding_map_lookup(c, mapping);
4372 Py_ssize_t requiredsize = *outpos+1;
4373 if (res == -1)
4374 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004375 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004376 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004377 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004378 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004379 outstart[(*outpos)++] = (char)res;
4380 return enc_SUCCESS;
4381 }
4382
4383 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004385 return enc_EXCEPTION;
4386 else if (rep==Py_None) {
4387 Py_DECREF(rep);
4388 return enc_FAILED;
4389 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004392 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004393 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004395 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004397 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4399 }
4400 else {
4401 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004402 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4403 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004404 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004405 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004407 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004409 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 memcpy(outstart + *outpos, repchars, repsize);
4411 *outpos += repsize;
4412 }
4413 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004414 Py_DECREF(rep);
4415 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416}
4417
4418/* handle an error in PyUnicode_EncodeCharmap
4419 Return 0 on success, -1 on error */
4420static
4421int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004422 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004424 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004425 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426{
4427 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004428 Py_ssize_t repsize;
4429 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 Py_UNICODE *uni2;
4431 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t collstartpos = *inpos;
4433 Py_ssize_t collendpos = *inpos+1;
4434 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 char *encoding = "charmap";
4436 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004437 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 /* find all unencodable characters */
4440 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004441 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004442 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004443 int res = encoding_map_lookup(p[collendpos], mapping);
4444 if (res != -1)
4445 break;
4446 ++collendpos;
4447 continue;
4448 }
4449
4450 rep = charmapencode_lookup(p[collendpos], mapping);
4451 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004453 else if (rep!=Py_None) {
4454 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 break;
4456 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004457 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 ++collendpos;
4459 }
4460 /* cache callback name lookup
4461 * (if not done yet, i.e. it's the first error) */
4462 if (*known_errorHandler==-1) {
4463 if ((errors==NULL) || (!strcmp(errors, "strict")))
4464 *known_errorHandler = 1;
4465 else if (!strcmp(errors, "replace"))
4466 *known_errorHandler = 2;
4467 else if (!strcmp(errors, "ignore"))
4468 *known_errorHandler = 3;
4469 else if (!strcmp(errors, "xmlcharrefreplace"))
4470 *known_errorHandler = 4;
4471 else
4472 *known_errorHandler = 0;
4473 }
4474 switch (*known_errorHandler) {
4475 case 1: /* strict */
4476 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4477 return -1;
4478 case 2: /* replace */
4479 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4480 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004481 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 return -1;
4483 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004484 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4486 return -1;
4487 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 }
4489 /* fall through */
4490 case 3: /* ignore */
4491 *inpos = collendpos;
4492 break;
4493 case 4: /* xmlcharrefreplace */
4494 /* generate replacement (temporarily (mis)uses p) */
4495 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4496 char buffer[2+29+1+1];
4497 char *cp;
4498 sprintf(buffer, "&#%d;", (int)p[collpos]);
4499 for (cp = buffer; *cp; ++cp) {
4500 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004501 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004503 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4505 return -1;
4506 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 }
4508 }
4509 *inpos = collendpos;
4510 break;
4511 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004512 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 encoding, reason, p, size, exceptionObject,
4514 collstartpos, collendpos, &newpos);
4515 if (repunicode == NULL)
4516 return -1;
4517 /* generate replacement */
4518 repsize = PyUnicode_GET_SIZE(repunicode);
4519 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4520 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004521 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 return -1;
4523 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004524 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4527 return -1;
4528 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 }
4530 *inpos = newpos;
4531 Py_DECREF(repunicode);
4532 }
4533 return 0;
4534}
4535
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004537 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 PyObject *mapping,
4539 const char *errors)
4540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 /* output object */
4542 PyObject *res = NULL;
4543 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004544 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004546 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 PyObject *errorHandler = NULL;
4548 PyObject *exc = NULL;
4549 /* the following variable is used for caching string comparisons
4550 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4551 * 3=ignore, 4=xmlcharrefreplace */
4552 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553
4554 /* Default to Latin-1 */
4555 if (mapping == NULL)
4556 return PyUnicode_EncodeLatin1(p, size, errors);
4557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 /* allocate enough for a simple encoding without
4559 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004560 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 if (res == NULL)
4562 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004563 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 while (inpos<size) {
4567 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004568 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004569 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004571 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (charmap_encoding_error(p, size, &inpos, mapping,
4573 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004574 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004575 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004576 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 else
4580 /* done with this character => adjust input position */
4581 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004585 if (respos<PyString_GET_SIZE(res))
4586 _PyString_Resize(&res, respos);
4587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 Py_XDECREF(exc);
4589 Py_XDECREF(errorHandler);
4590 return res;
4591
4592 onError:
4593 Py_XDECREF(res);
4594 Py_XDECREF(exc);
4595 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 return NULL;
4597}
4598
4599PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4600 PyObject *mapping)
4601{
4602 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4603 PyErr_BadArgument();
4604 return NULL;
4605 }
4606 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4607 PyUnicode_GET_SIZE(unicode),
4608 mapping,
4609 NULL);
4610}
4611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612/* create or adjust a UnicodeTranslateError */
4613static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004614 const Py_UNICODE *unicode, Py_ssize_t size,
4615 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 if (*exceptionObject == NULL) {
4619 *exceptionObject = PyUnicodeTranslateError_Create(
4620 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621 }
4622 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4624 goto onError;
4625 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4626 goto onError;
4627 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4628 goto onError;
4629 return;
4630 onError:
4631 Py_DECREF(*exceptionObject);
4632 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633 }
4634}
4635
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636/* raises a UnicodeTranslateError */
4637static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 const Py_UNICODE *unicode, Py_ssize_t size,
4639 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 const char *reason)
4641{
4642 make_translate_exception(exceptionObject,
4643 unicode, size, startpos, endpos, reason);
4644 if (*exceptionObject != NULL)
4645 PyCodec_StrictErrors(*exceptionObject);
4646}
4647
4648/* error handling callback helper:
4649 build arguments, call the callback and check the arguments,
4650 put the result into newpos and return the replacement string, which
4651 has to be freed by the caller */
4652static PyObject *unicode_translate_call_errorhandler(const char *errors,
4653 PyObject **errorHandler,
4654 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004655 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4656 Py_ssize_t startpos, Py_ssize_t endpos,
4657 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004659 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004661 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 PyObject *restuple;
4663 PyObject *resunicode;
4664
4665 if (*errorHandler == NULL) {
4666 *errorHandler = PyCodec_LookupError(errors);
4667 if (*errorHandler == NULL)
4668 return NULL;
4669 }
4670
4671 make_translate_exception(exceptionObject,
4672 unicode, size, startpos, endpos, reason);
4673 if (*exceptionObject == NULL)
4674 return NULL;
4675
4676 restuple = PyObject_CallFunctionObjArgs(
4677 *errorHandler, *exceptionObject, NULL);
4678 if (restuple == NULL)
4679 return NULL;
4680 if (!PyTuple_Check(restuple)) {
4681 PyErr_Format(PyExc_TypeError, &argparse[4]);
4682 Py_DECREF(restuple);
4683 return NULL;
4684 }
4685 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004686 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 Py_DECREF(restuple);
4688 return NULL;
4689 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004690 if (i_newpos<0)
4691 *newpos = size+i_newpos;
4692 else
4693 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004694 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004695 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004696 Py_DECREF(restuple);
4697 return NULL;
4698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 Py_INCREF(resunicode);
4700 Py_DECREF(restuple);
4701 return resunicode;
4702}
4703
4704/* Lookup the character ch in the mapping and put the result in result,
4705 which must be decrefed by the caller.
4706 Return 0 on success, -1 on error */
4707static
4708int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4709{
4710 PyObject *w = PyInt_FromLong((long)c);
4711 PyObject *x;
4712
4713 if (w == NULL)
4714 return -1;
4715 x = PyObject_GetItem(mapping, w);
4716 Py_DECREF(w);
4717 if (x == NULL) {
4718 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4719 /* No mapping found means: use 1:1 mapping. */
4720 PyErr_Clear();
4721 *result = NULL;
4722 return 0;
4723 } else
4724 return -1;
4725 }
4726 else if (x == Py_None) {
4727 *result = x;
4728 return 0;
4729 }
4730 else if (PyInt_Check(x)) {
4731 long value = PyInt_AS_LONG(x);
4732 long max = PyUnicode_GetMax();
4733 if (value < 0 || value > max) {
4734 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004735 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 Py_DECREF(x);
4737 return -1;
4738 }
4739 *result = x;
4740 return 0;
4741 }
4742 else if (PyUnicode_Check(x)) {
4743 *result = x;
4744 return 0;
4745 }
4746 else {
4747 /* wrong return value */
4748 PyErr_SetString(PyExc_TypeError,
4749 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004750 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 return -1;
4752 }
4753}
4754/* ensure that *outobj is at least requiredsize characters long,
4755if not reallocate and adjust various state variables.
4756Return 0 on success, -1 on error */
4757static
Walter Dörwald4894c302003-10-24 14:25:28 +00004758int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004759 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004761 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004762 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004764 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004765 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004766 if (requiredsize < 2 * oldsize)
4767 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004768 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 return -1;
4770 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 }
4772 return 0;
4773}
4774/* lookup the character, put the result in the output string and adjust
4775 various state variables. Return a new reference to the object that
4776 was put in the output buffer in *result, or Py_None, if the mapping was
4777 undefined (in which case no character was written).
4778 The called must decref result.
4779 Return 0 on success, -1 on error. */
4780static
Walter Dörwald4894c302003-10-24 14:25:28 +00004781int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004782 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004783 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784{
Walter Dörwald4894c302003-10-24 14:25:28 +00004785 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 return -1;
4787 if (*res==NULL) {
4788 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004789 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 }
4791 else if (*res==Py_None)
4792 ;
4793 else if (PyInt_Check(*res)) {
4794 /* no overflow check, because we know that the space is enough */
4795 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4796 }
4797 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004798 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 if (repsize==1) {
4800 /* no overflow check, because we know that the space is enough */
4801 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4802 }
4803 else if (repsize!=0) {
4804 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004805 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004806 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004807 repsize - 1;
4808 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 return -1;
4810 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4811 *outp += repsize;
4812 }
4813 }
4814 else
4815 return -1;
4816 return 0;
4817}
4818
4819PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004820 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 PyObject *mapping,
4822 const char *errors)
4823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 /* output object */
4825 PyObject *res = NULL;
4826 /* pointers to the beginning and end+1 of input */
4827 const Py_UNICODE *startp = p;
4828 const Py_UNICODE *endp = p + size;
4829 /* pointer into the output */
4830 Py_UNICODE *str;
4831 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004832 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 char *reason = "character maps to <undefined>";
4834 PyObject *errorHandler = NULL;
4835 PyObject *exc = NULL;
4836 /* the following variable is used for caching string comparisons
4837 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4838 * 3=ignore, 4=xmlcharrefreplace */
4839 int known_errorHandler = -1;
4840
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 if (mapping == NULL) {
4842 PyErr_BadArgument();
4843 return NULL;
4844 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845
4846 /* allocate enough for a simple 1:1 translation without
4847 replacements, if we need more, we'll resize */
4848 res = PyUnicode_FromUnicode(NULL, size);
4849 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004850 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852 return res;
4853 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 while (p<endp) {
4856 /* try to encode it */
4857 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004858 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004859 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 goto onError;
4861 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004862 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 if (x!=Py_None) /* it worked => adjust input pointer */
4864 ++p;
4865 else { /* untranslatable character */
4866 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004867 Py_ssize_t repsize;
4868 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 Py_UNICODE *uni2;
4870 /* startpos for collecting untranslatable chars */
4871 const Py_UNICODE *collstart = p;
4872 const Py_UNICODE *collend = p+1;
4873 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 /* find all untranslatable characters */
4876 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004877 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 goto onError;
4879 Py_XDECREF(x);
4880 if (x!=Py_None)
4881 break;
4882 ++collend;
4883 }
4884 /* cache callback name lookup
4885 * (if not done yet, i.e. it's the first error) */
4886 if (known_errorHandler==-1) {
4887 if ((errors==NULL) || (!strcmp(errors, "strict")))
4888 known_errorHandler = 1;
4889 else if (!strcmp(errors, "replace"))
4890 known_errorHandler = 2;
4891 else if (!strcmp(errors, "ignore"))
4892 known_errorHandler = 3;
4893 else if (!strcmp(errors, "xmlcharrefreplace"))
4894 known_errorHandler = 4;
4895 else
4896 known_errorHandler = 0;
4897 }
4898 switch (known_errorHandler) {
4899 case 1: /* strict */
4900 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4901 goto onError;
4902 case 2: /* replace */
4903 /* No need to check for space, this is a 1:1 replacement */
4904 for (coll = collstart; coll<collend; ++coll)
4905 *str++ = '?';
4906 /* fall through */
4907 case 3: /* ignore */
4908 p = collend;
4909 break;
4910 case 4: /* xmlcharrefreplace */
4911 /* generate replacement (temporarily (mis)uses p) */
4912 for (p = collstart; p < collend; ++p) {
4913 char buffer[2+29+1+1];
4914 char *cp;
4915 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004916 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004917 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4918 goto onError;
4919 for (cp = buffer; *cp; ++cp)
4920 *str++ = *cp;
4921 }
4922 p = collend;
4923 break;
4924 default:
4925 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4926 reason, startp, size, &exc,
4927 collstart-startp, collend-startp, &newpos);
4928 if (repunicode == NULL)
4929 goto onError;
4930 /* generate replacement */
4931 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004932 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004933 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4934 Py_DECREF(repunicode);
4935 goto onError;
4936 }
4937 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4938 *str++ = *uni2;
4939 p = startp + newpos;
4940 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 }
4942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944 /* Resize if we allocated to much */
4945 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004946 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004947 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004948 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 }
4950 Py_XDECREF(exc);
4951 Py_XDECREF(errorHandler);
4952 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004954 onError:
4955 Py_XDECREF(res);
4956 Py_XDECREF(exc);
4957 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 return NULL;
4959}
4960
4961PyObject *PyUnicode_Translate(PyObject *str,
4962 PyObject *mapping,
4963 const char *errors)
4964{
4965 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004966
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 str = PyUnicode_FromObject(str);
4968 if (str == NULL)
4969 goto onError;
4970 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4971 PyUnicode_GET_SIZE(str),
4972 mapping,
4973 errors);
4974 Py_DECREF(str);
4975 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004976
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 onError:
4978 Py_XDECREF(str);
4979 return NULL;
4980}
Tim Petersced69f82003-09-16 20:30:58 +00004981
Guido van Rossum9e896b32000-04-05 20:11:21 +00004982/* --- Decimal Encoder ---------------------------------------------------- */
4983
4984int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004985 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004986 char *output,
4987 const char *errors)
4988{
4989 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004990 PyObject *errorHandler = NULL;
4991 PyObject *exc = NULL;
4992 const char *encoding = "decimal";
4993 const char *reason = "invalid decimal Unicode string";
4994 /* the following variable is used for caching string comparisons
4995 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4996 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004997
4998 if (output == NULL) {
4999 PyErr_BadArgument();
5000 return -1;
5001 }
5002
5003 p = s;
5004 end = s + length;
5005 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005007 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005009 Py_ssize_t repsize;
5010 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 Py_UNICODE *uni2;
5012 Py_UNICODE *collstart;
5013 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005014
Guido van Rossum9e896b32000-04-05 20:11:21 +00005015 if (Py_UNICODE_ISSPACE(ch)) {
5016 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005018 continue;
5019 }
5020 decimal = Py_UNICODE_TODECIMAL(ch);
5021 if (decimal >= 0) {
5022 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005024 continue;
5025 }
Guido van Rossumba477042000-04-06 18:18:10 +00005026 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005027 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005029 continue;
5030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 /* All other characters are considered unencodable */
5032 collstart = p;
5033 collend = p+1;
5034 while (collend < end) {
5035 if ((0 < *collend && *collend < 256) ||
5036 !Py_UNICODE_ISSPACE(*collend) ||
5037 Py_UNICODE_TODECIMAL(*collend))
5038 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 /* cache callback name lookup
5041 * (if not done yet, i.e. it's the first error) */
5042 if (known_errorHandler==-1) {
5043 if ((errors==NULL) || (!strcmp(errors, "strict")))
5044 known_errorHandler = 1;
5045 else if (!strcmp(errors, "replace"))
5046 known_errorHandler = 2;
5047 else if (!strcmp(errors, "ignore"))
5048 known_errorHandler = 3;
5049 else if (!strcmp(errors, "xmlcharrefreplace"))
5050 known_errorHandler = 4;
5051 else
5052 known_errorHandler = 0;
5053 }
5054 switch (known_errorHandler) {
5055 case 1: /* strict */
5056 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5057 goto onError;
5058 case 2: /* replace */
5059 for (p = collstart; p < collend; ++p)
5060 *output++ = '?';
5061 /* fall through */
5062 case 3: /* ignore */
5063 p = collend;
5064 break;
5065 case 4: /* xmlcharrefreplace */
5066 /* generate replacement (temporarily (mis)uses p) */
5067 for (p = collstart; p < collend; ++p)
5068 output += sprintf(output, "&#%d;", (int)*p);
5069 p = collend;
5070 break;
5071 default:
5072 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5073 encoding, reason, s, length, &exc,
5074 collstart-s, collend-s, &newpos);
5075 if (repunicode == NULL)
5076 goto onError;
5077 /* generate replacement */
5078 repsize = PyUnicode_GET_SIZE(repunicode);
5079 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5080 Py_UNICODE ch = *uni2;
5081 if (Py_UNICODE_ISSPACE(ch))
5082 *output++ = ' ';
5083 else {
5084 decimal = Py_UNICODE_TODECIMAL(ch);
5085 if (decimal >= 0)
5086 *output++ = '0' + decimal;
5087 else if (0 < ch && ch < 256)
5088 *output++ = (char)ch;
5089 else {
5090 Py_DECREF(repunicode);
5091 raise_encode_exception(&exc, encoding,
5092 s, length, collstart-s, collend-s, reason);
5093 goto onError;
5094 }
5095 }
5096 }
5097 p = s + newpos;
5098 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005099 }
5100 }
5101 /* 0-terminate the output string */
5102 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 Py_XDECREF(exc);
5104 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005105 return 0;
5106
5107 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005108 Py_XDECREF(exc);
5109 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005110 return -1;
5111}
5112
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113/* --- Helpers ------------------------------------------------------------ */
5114
Eric Smith8c663262007-08-25 02:26:07 +00005115#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005116#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005117#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005118/* Include _ParseTupleFinds from find.h */
5119#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005120#include "stringlib/find.h"
5121#include "stringlib/partition.h"
5122
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope,
   using (obj)->length as the sequence length: a negative value has the
   length added once and is then clamped to 0; `end` is additionally
   capped at the length.  `start` is not capped at the length.

   Wrapped in do { } while (0) so that the expansion is a single
   statement and is safe under an unbraced if/else; invoke it with a
   trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5135
Martin v. Löwis18e16552006-02-15 17:27:45 +00005136Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005137 PyObject *substr,
5138 Py_ssize_t start,
5139 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005141 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005142 PyUnicodeObject* str_obj;
5143 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005144
Thomas Wouters477c8d52006-05-27 19:21:47 +00005145 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5146 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005148 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5149 if (!sub_obj) {
5150 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 return -1;
5152 }
Tim Petersced69f82003-09-16 20:30:58 +00005153
Thomas Wouters477c8d52006-05-27 19:21:47 +00005154 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005155
Thomas Wouters477c8d52006-05-27 19:21:47 +00005156 result = stringlib_count(
5157 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5158 );
5159
5160 Py_DECREF(sub_obj);
5161 Py_DECREF(str_obj);
5162
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 return result;
5164}
5165
Martin v. Löwis18e16552006-02-15 17:27:45 +00005166Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005167 PyObject *sub,
5168 Py_ssize_t start,
5169 Py_ssize_t end,
5170 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005172 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005173
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005175 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005176 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005177 sub = PyUnicode_FromObject(sub);
5178 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005179 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005180 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 }
Tim Petersced69f82003-09-16 20:30:58 +00005182
Thomas Wouters477c8d52006-05-27 19:21:47 +00005183 if (direction > 0)
5184 result = stringlib_find_slice(
5185 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5186 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5187 start, end
5188 );
5189 else
5190 result = stringlib_rfind_slice(
5191 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5192 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5193 start, end
5194 );
5195
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005197 Py_DECREF(sub);
5198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 return result;
5200}
5201
Tim Petersced69f82003-09-16 20:30:58 +00005202static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203int tailmatch(PyUnicodeObject *self,
5204 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005205 Py_ssize_t start,
5206 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 int direction)
5208{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 if (substring->length == 0)
5210 return 1;
5211
Thomas Wouters477c8d52006-05-27 19:21:47 +00005212 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
5214 end -= substring->length;
5215 if (end < start)
5216 return 0;
5217
5218 if (direction > 0) {
5219 if (Py_UNICODE_MATCH(self, end, substring))
5220 return 1;
5221 } else {
5222 if (Py_UNICODE_MATCH(self, start, substring))
5223 return 1;
5224 }
5225
5226 return 0;
5227}
5228
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005231 Py_ssize_t start,
5232 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 int direction)
5234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 str = PyUnicode_FromObject(str);
5238 if (str == NULL)
5239 return -1;
5240 substr = PyUnicode_FromObject(substr);
5241 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005242 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 return -1;
5244 }
Tim Petersced69f82003-09-16 20:30:58 +00005245
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 result = tailmatch((PyUnicodeObject *)str,
5247 (PyUnicodeObject *)substr,
5248 start, end, direction);
5249 Py_DECREF(str);
5250 Py_DECREF(substr);
5251 return result;
5252}
5253
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254/* Apply fixfct filter to the Unicode object self and return a
5255 reference to the modified object */
5256
Tim Petersced69f82003-09-16 20:30:58 +00005257static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258PyObject *fixup(PyUnicodeObject *self,
5259 int (*fixfct)(PyUnicodeObject *s))
5260{
5261
5262 PyUnicodeObject *u;
5263
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005264 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 if (u == NULL)
5266 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005267
5268 Py_UNICODE_COPY(u->str, self->str, self->length);
5269
Tim Peters7a29bd52001-09-12 03:03:31 +00005270 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 /* fixfct should return TRUE if it modified the buffer. If
5272 FALSE, return a reference to the original buffer instead
5273 (to save space, not time) */
5274 Py_INCREF(self);
5275 Py_DECREF(u);
5276 return (PyObject*) self;
5277 }
5278 return (PyObject*) u;
5279}
5280
Tim Petersced69f82003-09-16 20:30:58 +00005281static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282int fixupper(PyUnicodeObject *self)
5283{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005284 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 Py_UNICODE *s = self->str;
5286 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005287
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 while (len-- > 0) {
5289 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005290
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 ch = Py_UNICODE_TOUPPER(*s);
5292 if (ch != *s) {
5293 status = 1;
5294 *s = ch;
5295 }
5296 s++;
5297 }
5298
5299 return status;
5300}
5301
Tim Petersced69f82003-09-16 20:30:58 +00005302static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303int fixlower(PyUnicodeObject *self)
5304{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005305 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 Py_UNICODE *s = self->str;
5307 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005308
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 while (len-- > 0) {
5310 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005311
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 ch = Py_UNICODE_TOLOWER(*s);
5313 if (ch != *s) {
5314 status = 1;
5315 *s = ch;
5316 }
5317 s++;
5318 }
5319
5320 return status;
5321}
5322
Tim Petersced69f82003-09-16 20:30:58 +00005323static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324int fixswapcase(PyUnicodeObject *self)
5325{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 Py_UNICODE *s = self->str;
5328 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005329
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 while (len-- > 0) {
5331 if (Py_UNICODE_ISUPPER(*s)) {
5332 *s = Py_UNICODE_TOLOWER(*s);
5333 status = 1;
5334 } else if (Py_UNICODE_ISLOWER(*s)) {
5335 *s = Py_UNICODE_TOUPPER(*s);
5336 status = 1;
5337 }
5338 s++;
5339 }
5340
5341 return status;
5342}
5343
Tim Petersced69f82003-09-16 20:30:58 +00005344static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345int fixcapitalize(PyUnicodeObject *self)
5346{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005347 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005348 Py_UNICODE *s = self->str;
5349 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005350
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005351 if (len == 0)
5352 return 0;
5353 if (Py_UNICODE_ISLOWER(*s)) {
5354 *s = Py_UNICODE_TOUPPER(*s);
5355 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005357 s++;
5358 while (--len > 0) {
5359 if (Py_UNICODE_ISUPPER(*s)) {
5360 *s = Py_UNICODE_TOLOWER(*s);
5361 status = 1;
5362 }
5363 s++;
5364 }
5365 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366}
5367
5368static
5369int fixtitle(PyUnicodeObject *self)
5370{
5371 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5372 register Py_UNICODE *e;
5373 int previous_is_cased;
5374
5375 /* Shortcut for single character strings */
5376 if (PyUnicode_GET_SIZE(self) == 1) {
5377 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5378 if (*p != ch) {
5379 *p = ch;
5380 return 1;
5381 }
5382 else
5383 return 0;
5384 }
Tim Petersced69f82003-09-16 20:30:58 +00005385
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 e = p + PyUnicode_GET_SIZE(self);
5387 previous_is_cased = 0;
5388 for (; p < e; p++) {
5389 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 if (previous_is_cased)
5392 *p = Py_UNICODE_TOLOWER(ch);
5393 else
5394 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005395
5396 if (Py_UNICODE_ISLOWER(ch) ||
5397 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 Py_UNICODE_ISTITLE(ch))
5399 previous_is_cased = 1;
5400 else
5401 previous_is_cased = 0;
5402 }
5403 return 1;
5404}
5405
Tim Peters8ce9f162004-08-27 01:49:32 +00005406PyObject *
5407PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408{
Tim Peters8ce9f162004-08-27 01:49:32 +00005409 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005410 const Py_UNICODE blank = ' ';
5411 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005412 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005413 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005414 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5415 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005416 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5417 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005418 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005419 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005420 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421
Tim Peters05eba1f2004-08-27 21:32:02 +00005422 fseq = PySequence_Fast(seq, "");
5423 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005424 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005425 }
5426
Tim Peters91879ab2004-08-27 22:35:44 +00005427 /* Grrrr. A codec may be invoked to convert str objects to
5428 * Unicode, and so it's possible to call back into Python code
5429 * during PyUnicode_FromObject(), and so it's possible for a sick
5430 * codec to change the size of fseq (if seq is a list). Therefore
5431 * we have to keep refetching the size -- can't assume seqlen
5432 * is invariant.
5433 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005434 seqlen = PySequence_Fast_GET_SIZE(fseq);
5435 /* If empty sequence, return u"". */
5436 if (seqlen == 0) {
5437 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5438 goto Done;
5439 }
5440 /* If singleton sequence with an exact Unicode, return that. */
5441 if (seqlen == 1) {
5442 item = PySequence_Fast_GET_ITEM(fseq, 0);
5443 if (PyUnicode_CheckExact(item)) {
5444 Py_INCREF(item);
5445 res = (PyUnicodeObject *)item;
5446 goto Done;
5447 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005448 }
5449
Tim Peters05eba1f2004-08-27 21:32:02 +00005450 /* At least two items to join, or one that isn't exact Unicode. */
5451 if (seqlen > 1) {
5452 /* Set up sep and seplen -- they're needed. */
5453 if (separator == NULL) {
5454 sep = &blank;
5455 seplen = 1;
5456 }
5457 else {
5458 internal_separator = PyUnicode_FromObject(separator);
5459 if (internal_separator == NULL)
5460 goto onError;
5461 sep = PyUnicode_AS_UNICODE(internal_separator);
5462 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005463 /* In case PyUnicode_FromObject() mutated seq. */
5464 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005465 }
5466 }
5467
5468 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005469 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005470 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005471 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 res_p = PyUnicode_AS_UNICODE(res);
5473 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005474
Tim Peters05eba1f2004-08-27 21:32:02 +00005475 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005476 Py_ssize_t itemlen;
5477 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005478
5479 item = PySequence_Fast_GET_ITEM(fseq, i);
5480 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005481 if (!PyUnicode_Check(item)) {
5482 PyErr_Format(PyExc_TypeError,
5483 "sequence item %zd: expected str instance,"
5484 " %.80s found",
5485 i, Py_Type(item)->tp_name);
5486 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005487 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005488 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005489 if (item == NULL)
5490 goto onError;
5491 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005492
Tim Peters91879ab2004-08-27 22:35:44 +00005493 /* In case PyUnicode_FromObject() mutated seq. */
5494 seqlen = PySequence_Fast_GET_SIZE(fseq);
5495
Tim Peters8ce9f162004-08-27 01:49:32 +00005496 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005498 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005499 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005500 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005501 if (i < seqlen - 1) {
5502 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005503 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005504 goto Overflow;
5505 }
5506 if (new_res_used > res_alloc) {
5507 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005508 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005509 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005510 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005511 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005512 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005513 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005514 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005516 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005517 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005519
5520 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005521 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005522 res_p += itemlen;
5523 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005524 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005525 res_p += seplen;
5526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005528 res_used = new_res_used;
5529 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005530
Tim Peters05eba1f2004-08-27 21:32:02 +00005531 /* Shrink res to match the used area; this probably can't fail,
5532 * but it's cheap to check.
5533 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005534 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005535 goto onError;
5536
5537 Done:
5538 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005539 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 return (PyObject *)res;
5541
Tim Peters8ce9f162004-08-27 01:49:32 +00005542 Overflow:
5543 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005544 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005545 Py_DECREF(item);
5546 /* fall through */
5547
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005549 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005551 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 return NULL;
5553}
5554
Tim Petersced69f82003-09-16 20:30:58 +00005555static
5556PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005557 Py_ssize_t left,
5558 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 Py_UNICODE fill)
5560{
5561 PyUnicodeObject *u;
5562
5563 if (left < 0)
5564 left = 0;
5565 if (right < 0)
5566 right = 0;
5567
Tim Peters7a29bd52001-09-12 03:03:31 +00005568 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 Py_INCREF(self);
5570 return self;
5571 }
5572
5573 u = _PyUnicode_New(left + self->length + right);
5574 if (u) {
5575 if (left)
5576 Py_UNICODE_FILL(u->str, fill, left);
5577 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5578 if (right)
5579 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5580 }
5581
5582 return u;
5583}
5584
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expansion relies on names from the enclosing function: a
   `PyObject *str` scratch variable, the `list` being built, and an
   `onError` label that is jumped to when creating the slice or
   appending it fails.  The temporary reference held in `str` is always
   released.  The body is wrapped in do { } while (0) so that an
   invocation followed by ';' forms exactly one statement, even inside
   an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5595
5596static
5597PyObject *split_whitespace(PyUnicodeObject *self,
5598 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005599 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005601 register Py_ssize_t i;
5602 register Py_ssize_t j;
5603 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 PyObject *str;
5605
5606 for (i = j = 0; i < len; ) {
5607 /* find a token */
5608 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5609 i++;
5610 j = i;
5611 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5612 i++;
5613 if (j < i) {
5614 if (maxcount-- <= 0)
5615 break;
5616 SPLIT_APPEND(self->str, j, i);
5617 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5618 i++;
5619 j = i;
5620 }
5621 }
5622 if (j < len) {
5623 SPLIT_APPEND(self->str, j, len);
5624 }
5625 return list;
5626
5627 onError:
5628 Py_DECREF(list);
5629 return NULL;
5630}
5631
5632PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005633 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 register Py_ssize_t i;
5636 register Py_ssize_t j;
5637 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 PyObject *list;
5639 PyObject *str;
5640 Py_UNICODE *data;
5641
5642 string = PyUnicode_FromObject(string);
5643 if (string == NULL)
5644 return NULL;
5645 data = PyUnicode_AS_UNICODE(string);
5646 len = PyUnicode_GET_SIZE(string);
5647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 list = PyList_New(0);
5649 if (!list)
5650 goto onError;
5651
5652 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005653 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005656 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658
5659 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005660 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 if (i < len) {
5662 if (data[i] == '\r' && i + 1 < len &&
5663 data[i+1] == '\n')
5664 i += 2;
5665 else
5666 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005667 if (keepends)
5668 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 }
Guido van Rossum86662912000-04-11 15:38:46 +00005670 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 j = i;
5672 }
5673 if (j < len) {
5674 SPLIT_APPEND(data, j, len);
5675 }
5676
5677 Py_DECREF(string);
5678 return list;
5679
5680 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005681 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 Py_DECREF(string);
5683 return NULL;
5684}
5685
Tim Petersced69f82003-09-16 20:30:58 +00005686static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687PyObject *split_char(PyUnicodeObject *self,
5688 PyObject *list,
5689 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005690 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005692 register Py_ssize_t i;
5693 register Py_ssize_t j;
5694 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 PyObject *str;
5696
5697 for (i = j = 0; i < len; ) {
5698 if (self->str[i] == ch) {
5699 if (maxcount-- <= 0)
5700 break;
5701 SPLIT_APPEND(self->str, j, i);
5702 i = j = i + 1;
5703 } else
5704 i++;
5705 }
5706 if (j <= len) {
5707 SPLIT_APPEND(self->str, j, len);
5708 }
5709 return list;
5710
5711 onError:
5712 Py_DECREF(list);
5713 return NULL;
5714}
5715
Tim Petersced69f82003-09-16 20:30:58 +00005716static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717PyObject *split_substring(PyUnicodeObject *self,
5718 PyObject *list,
5719 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005720 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005722 register Py_ssize_t i;
5723 register Py_ssize_t j;
5724 Py_ssize_t len = self->length;
5725 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 PyObject *str;
5727
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005728 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 if (Py_UNICODE_MATCH(self, i, substring)) {
5730 if (maxcount-- <= 0)
5731 break;
5732 SPLIT_APPEND(self->str, j, i);
5733 i = j = i + sublen;
5734 } else
5735 i++;
5736 }
5737 if (j <= len) {
5738 SPLIT_APPEND(self->str, j, len);
5739 }
5740 return list;
5741
5742 onError:
5743 Py_DECREF(list);
5744 return NULL;
5745}
5746
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005747static
5748PyObject *rsplit_whitespace(PyUnicodeObject *self,
5749 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005750 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005751{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 register Py_ssize_t i;
5753 register Py_ssize_t j;
5754 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005755 PyObject *str;
5756
5757 for (i = j = len - 1; i >= 0; ) {
5758 /* find a token */
5759 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5760 i--;
5761 j = i;
5762 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5763 i--;
5764 if (j > i) {
5765 if (maxcount-- <= 0)
5766 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005767 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005768 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5769 i--;
5770 j = i;
5771 }
5772 }
5773 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005774 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005776 if (PyList_Reverse(list) < 0)
5777 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778 return list;
5779
5780 onError:
5781 Py_DECREF(list);
5782 return NULL;
5783}
5784
5785static
5786PyObject *rsplit_char(PyUnicodeObject *self,
5787 PyObject *list,
5788 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005789 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005791 register Py_ssize_t i;
5792 register Py_ssize_t j;
5793 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794 PyObject *str;
5795
5796 for (i = j = len - 1; i >= 0; ) {
5797 if (self->str[i] == ch) {
5798 if (maxcount-- <= 0)
5799 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005800 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005801 j = i = i - 1;
5802 } else
5803 i--;
5804 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005805 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005806 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005807 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005808 if (PyList_Reverse(list) < 0)
5809 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005810 return list;
5811
5812 onError:
5813 Py_DECREF(list);
5814 return NULL;
5815}
5816
5817static
5818PyObject *rsplit_substring(PyUnicodeObject *self,
5819 PyObject *list,
5820 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005821 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005822{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005823 register Py_ssize_t i;
5824 register Py_ssize_t j;
5825 Py_ssize_t len = self->length;
5826 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005827 PyObject *str;
5828
5829 for (i = len - sublen, j = len; i >= 0; ) {
5830 if (Py_UNICODE_MATCH(self, i, substring)) {
5831 if (maxcount-- <= 0)
5832 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005833 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005834 j = i;
5835 i -= sublen;
5836 } else
5837 i--;
5838 }
5839 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005840 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005841 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842 if (PyList_Reverse(list) < 0)
5843 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005844 return list;
5845
5846 onError:
5847 Py_DECREF(list);
5848 return NULL;
5849}
5850
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851#undef SPLIT_APPEND
5852
5853static
5854PyObject *split(PyUnicodeObject *self,
5855 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005856 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857{
5858 PyObject *list;
5859
5860 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005861 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862
5863 list = PyList_New(0);
5864 if (!list)
5865 return NULL;
5866
5867 if (substring == NULL)
5868 return split_whitespace(self,list,maxcount);
5869
5870 else if (substring->length == 1)
5871 return split_char(self,list,substring->str[0],maxcount);
5872
5873 else if (substring->length == 0) {
5874 Py_DECREF(list);
5875 PyErr_SetString(PyExc_ValueError, "empty separator");
5876 return NULL;
5877 }
5878 else
5879 return split_substring(self,list,substring,maxcount);
5880}
5881
Tim Petersced69f82003-09-16 20:30:58 +00005882static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005883PyObject *rsplit(PyUnicodeObject *self,
5884 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005885 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005886{
5887 PyObject *list;
5888
5889 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005890 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005891
5892 list = PyList_New(0);
5893 if (!list)
5894 return NULL;
5895
5896 if (substring == NULL)
5897 return rsplit_whitespace(self,list,maxcount);
5898
5899 else if (substring->length == 1)
5900 return rsplit_char(self,list,substring->str[0],maxcount);
5901
5902 else if (substring->length == 0) {
5903 Py_DECREF(list);
5904 PyErr_SetString(PyExc_ValueError, "empty separator");
5905 return NULL;
5906 }
5907 else
5908 return rsplit_substring(self,list,substring,maxcount);
5909}
5910
5911static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912PyObject *replace(PyUnicodeObject *self,
5913 PyUnicodeObject *str1,
5914 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005915 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
5917 PyUnicodeObject *u;
5918
5919 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005920 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
Thomas Wouters477c8d52006-05-27 19:21:47 +00005922 if (str1->length == str2->length) {
5923 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005924 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005925 if (str1->length == 1) {
5926 /* replace characters */
5927 Py_UNICODE u1, u2;
5928 if (!findchar(self->str, self->length, str1->str[0]))
5929 goto nothing;
5930 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5931 if (!u)
5932 return NULL;
5933 Py_UNICODE_COPY(u->str, self->str, self->length);
5934 u1 = str1->str[0];
5935 u2 = str2->str[0];
5936 for (i = 0; i < u->length; i++)
5937 if (u->str[i] == u1) {
5938 if (--maxcount < 0)
5939 break;
5940 u->str[i] = u2;
5941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005943 i = fastsearch(
5944 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005946 if (i < 0)
5947 goto nothing;
5948 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5949 if (!u)
5950 return NULL;
5951 Py_UNICODE_COPY(u->str, self->str, self->length);
5952 while (i <= self->length - str1->length)
5953 if (Py_UNICODE_MATCH(self, i, str1)) {
5954 if (--maxcount < 0)
5955 break;
5956 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5957 i += str1->length;
5958 } else
5959 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005962
5963 Py_ssize_t n, i, j, e;
5964 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 Py_UNICODE *p;
5966
5967 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005968 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 if (n > maxcount)
5970 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005971 if (n == 0)
5972 goto nothing;
5973 /* new_size = self->length + n * (str2->length - str1->length)); */
5974 delta = (str2->length - str1->length);
5975 if (delta == 0) {
5976 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005978 product = n * (str2->length - str1->length);
5979 if ((product / (str2->length - str1->length)) != n) {
5980 PyErr_SetString(PyExc_OverflowError,
5981 "replace string is too long");
5982 return NULL;
5983 }
5984 new_size = self->length + product;
5985 if (new_size < 0) {
5986 PyErr_SetString(PyExc_OverflowError,
5987 "replace string is too long");
5988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 }
5990 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005991 u = _PyUnicode_New(new_size);
5992 if (!u)
5993 return NULL;
5994 i = 0;
5995 p = u->str;
5996 e = self->length - str1->length;
5997 if (str1->length > 0) {
5998 while (n-- > 0) {
5999 /* look for next match */
6000 j = i;
6001 while (j <= e) {
6002 if (Py_UNICODE_MATCH(self, j, str1))
6003 break;
6004 j++;
6005 }
6006 if (j > i) {
6007 if (j > e)
6008 break;
6009 /* copy unchanged part [i:j] */
6010 Py_UNICODE_COPY(p, self->str+i, j-i);
6011 p += j - i;
6012 }
6013 /* copy substitution string */
6014 if (str2->length > 0) {
6015 Py_UNICODE_COPY(p, str2->str, str2->length);
6016 p += str2->length;
6017 }
6018 i = j + str1->length;
6019 }
6020 if (i < self->length)
6021 /* copy tail [i:] */
6022 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6023 } else {
6024 /* interleave */
6025 while (n > 0) {
6026 Py_UNICODE_COPY(p, str2->str, str2->length);
6027 p += str2->length;
6028 if (--n <= 0)
6029 break;
6030 *p++ = self->str[i++];
6031 }
6032 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006036
6037nothing:
6038 /* nothing to replace; return original string (when possible) */
6039 if (PyUnicode_CheckExact(self)) {
6040 Py_INCREF(self);
6041 return (PyObject *) self;
6042 }
6043 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044}
6045
6046/* --- Unicode Object Methods --------------------------------------------- */
6047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006048PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049"S.title() -> unicode\n\
6050\n\
6051Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006052characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053
6054static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006055unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 return fixup(self, fixtitle);
6058}
6059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006060PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061"S.capitalize() -> unicode\n\
6062\n\
6063Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006064have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065
6066static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006067unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 return fixup(self, fixcapitalize);
6070}
6071
#if 0
/* Disabled: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word via
   fixup(word, fixcapitalize), and join the words with a single blank.
   Returns the joined string, or NULL on failure. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old_word,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
6109
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006110/* Argument converter. Coerces to a single unicode character */
6111
6112static int
6113convert_uc(PyObject *obj, void *addr)
6114{
6115 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6116 PyObject *uniobj;
6117 Py_UNICODE *unistr;
6118
6119 uniobj = PyUnicode_FromObject(obj);
6120 if (uniobj == NULL) {
6121 PyErr_SetString(PyExc_TypeError,
6122 "The fill character cannot be converted to Unicode");
6123 return 0;
6124 }
6125 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6126 PyErr_SetString(PyExc_TypeError,
6127 "The fill character must be exactly one character long");
6128 Py_DECREF(uniobj);
6129 return 0;
6130 }
6131 unistr = PyUnicode_AS_UNICODE(uniobj);
6132 *fillcharloc = unistr[0];
6133 Py_DECREF(uniobj);
6134 return 1;
6135}
6136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006137PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006138"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006140Return S centered in a Unicode string of length width. Padding is\n\
6141done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142
6143static PyObject *
6144unicode_center(PyUnicodeObject *self, PyObject *args)
6145{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006146 Py_ssize_t marg, left;
6147 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006148 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
Thomas Woutersde017742006-02-16 19:34:37 +00006150 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 return NULL;
6152
Tim Peters7a29bd52001-09-12 03:03:31 +00006153 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 Py_INCREF(self);
6155 return (PyObject*) self;
6156 }
6157
6158 marg = width - self->length;
6159 left = marg / 2 + (marg & width & 1);
6160
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006161 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162}
6163
Marc-André Lemburge5034372000-08-08 08:04:29 +00006164#if 0
6165
6166/* This code should go into some future Unicode collation support
6167 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006168 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006169
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006170/* speedy UTF-16 code point order comparison */
6171/* gleaned from: */
6172/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6173
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006174static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006175{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006176 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006177 0, 0, 0, 0, 0, 0, 0, 0,
6178 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006179 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006180};
6181
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182static int
6183unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006185 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006186
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 Py_UNICODE *s1 = str1->str;
6188 Py_UNICODE *s2 = str2->str;
6189
6190 len1 = str1->length;
6191 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006192
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006194 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006195
6196 c1 = *s1++;
6197 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006198
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006199 if (c1 > (1<<11) * 26)
6200 c1 += utf16Fixup[c1>>11];
6201 if (c2 > (1<<11) * 26)
6202 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006203 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006204
6205 if (c1 != c2)
6206 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006207
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006208 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 }
6210
6211 return (len1 < len2) ? -1 : (len1 != len2);
6212}
6213
Marc-André Lemburge5034372000-08-08 08:04:29 +00006214#else
6215
6216static int
6217unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6218{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006219 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006220
6221 Py_UNICODE *s1 = str1->str;
6222 Py_UNICODE *s2 = str2->str;
6223
6224 len1 = str1->length;
6225 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006226
Marc-André Lemburge5034372000-08-08 08:04:29 +00006227 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006228 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006229
Fredrik Lundh45714e92001-06-26 16:39:36 +00006230 c1 = *s1++;
6231 c2 = *s2++;
6232
6233 if (c1 != c2)
6234 return (c1 < c2) ? -1 : 1;
6235
Marc-André Lemburge5034372000-08-08 08:04:29 +00006236 len1--; len2--;
6237 }
6238
6239 return (len1 < len2) ? -1 : (len1 != len2);
6240}
6241
6242#endif
6243
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244int PyUnicode_Compare(PyObject *left,
6245 PyObject *right)
6246{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006247 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6248 return unicode_compare((PyUnicodeObject *)left,
6249 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006250 PyErr_Format(PyExc_TypeError,
6251 "Can't compare %.100s and %.100s",
6252 left->ob_type->tp_name,
6253 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 return -1;
6255}
6256
Martin v. Löwis5b222132007-06-10 09:51:05 +00006257int
6258PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6259{
6260 int i;
6261 Py_UNICODE *id;
6262 assert(PyUnicode_Check(uni));
6263 id = PyUnicode_AS_UNICODE(uni);
6264 /* Compare Unicode string and source character set string */
6265 for (i = 0; id[i] && str[i]; i++)
6266 if (id[i] != str[i])
6267 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6268 if (id[i])
6269 return 1; /* uni is longer */
6270 if (str[i])
6271 return -1; /* str is longer */
6272 return 0;
6273}
6274
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006275PyObject *PyUnicode_RichCompare(PyObject *left,
6276 PyObject *right,
6277 int op)
6278{
6279 int result;
6280
6281 result = PyUnicode_Compare(left, right);
6282 if (result == -1 && PyErr_Occurred())
6283 goto onError;
6284
6285 /* Convert the return value to a Boolean */
6286 switch (op) {
6287 case Py_EQ:
6288 result = (result == 0);
6289 break;
6290 case Py_NE:
6291 result = (result != 0);
6292 break;
6293 case Py_LE:
6294 result = (result <= 0);
6295 break;
6296 case Py_GE:
6297 result = (result >= 0);
6298 break;
6299 case Py_LT:
6300 result = (result == -1);
6301 break;
6302 case Py_GT:
6303 result = (result == 1);
6304 break;
6305 }
6306 return PyBool_FromLong(result);
6307
6308 onError:
6309
6310 /* Standard case
6311
6312 Type errors mean that PyUnicode_FromObject() could not convert
6313 one of the arguments (usually the right hand side) to Unicode,
6314 ie. we can't handle the comparison request. However, it is
6315 possible that the other object knows a comparison method, which
6316 is why we return Py_NotImplemented to give the other object a
6317 chance.
6318
6319 */
6320 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6321 PyErr_Clear();
6322 Py_INCREF(Py_NotImplemented);
6323 return Py_NotImplemented;
6324 }
6325 if (op != Py_EQ && op != Py_NE)
6326 return NULL;
6327
6328 /* Equality comparison.
6329
6330 This is a special case: we silence any PyExc_UnicodeDecodeError
6331 and instead turn it into a PyErr_UnicodeWarning.
6332
6333 */
6334 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6335 return NULL;
6336 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006337 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6338 (op == Py_EQ) ?
6339 "Unicode equal comparison "
6340 "failed to convert both arguments to Unicode - "
6341 "interpreting them as being unequal"
6342 :
6343 "Unicode unequal comparison "
6344 "failed to convert both arguments to Unicode - "
6345 "interpreting them as being unequal",
6346 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006347 return NULL;
6348 result = (op == Py_NE);
6349 return PyBool_FromLong(result);
6350}
6351
Guido van Rossum403d68b2000-03-13 15:55:09 +00006352int PyUnicode_Contains(PyObject *container,
6353 PyObject *element)
6354{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006355 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006356 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006357
6358 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006359 sub = PyUnicode_FromObject(element);
6360 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006361 PyErr_Format(PyExc_TypeError,
6362 "'in <string>' requires string as left operand, not %s",
6363 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006364 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006365 }
6366
Thomas Wouters477c8d52006-05-27 19:21:47 +00006367 str = PyUnicode_FromObject(container);
6368 if (!str) {
6369 Py_DECREF(sub);
6370 return -1;
6371 }
6372
6373 result = stringlib_contains_obj(str, sub);
6374
6375 Py_DECREF(str);
6376 Py_DECREF(sub);
6377
Guido van Rossum403d68b2000-03-13 15:55:09 +00006378 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006379}
6380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381/* Concat to string or Unicode object giving a new Unicode object. */
6382
6383PyObject *PyUnicode_Concat(PyObject *left,
6384 PyObject *right)
6385{
6386 PyUnicodeObject *u = NULL, *v = NULL, *w;
6387
6388 /* Coerce the two arguments */
6389 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6390 if (u == NULL)
6391 goto onError;
6392 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6393 if (v == NULL)
6394 goto onError;
6395
6396 /* Shortcuts */
6397 if (v == unicode_empty) {
6398 Py_DECREF(v);
6399 return (PyObject *)u;
6400 }
6401 if (u == unicode_empty) {
6402 Py_DECREF(u);
6403 return (PyObject *)v;
6404 }
6405
6406 /* Concat the two Unicode strings */
6407 w = _PyUnicode_New(u->length + v->length);
6408 if (w == NULL)
6409 goto onError;
6410 Py_UNICODE_COPY(w->str, u->str, u->length);
6411 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6412
6413 Py_DECREF(u);
6414 Py_DECREF(v);
6415 return (PyObject *)w;
6416
6417onError:
6418 Py_XDECREF(u);
6419 Py_XDECREF(v);
6420 return NULL;
6421}
6422
Walter Dörwald1ab83302007-05-18 17:15:44 +00006423void
6424PyUnicode_Append(PyObject **pleft, PyObject *right)
6425{
6426 PyObject *new;
6427 if (*pleft == NULL)
6428 return;
6429 if (right == NULL || !PyUnicode_Check(*pleft)) {
6430 Py_DECREF(*pleft);
6431 *pleft = NULL;
6432 return;
6433 }
6434 new = PyUnicode_Concat(*pleft, right);
6435 Py_DECREF(*pleft);
6436 *pleft = new;
6437}
6438
6439void
6440PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6441{
6442 PyUnicode_Append(pleft, right);
6443 Py_XDECREF(right);
6444}
6445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006446PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447"S.count(sub[, start[, end]]) -> int\n\
6448\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449Return the number of non-overlapping occurrences of substring sub in\n\
6450Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006451interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452
6453static PyObject *
6454unicode_count(PyUnicodeObject *self, PyObject *args)
6455{
6456 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006457 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006458 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 PyObject *result;
6460
Guido van Rossumb8872e62000-05-09 14:14:27 +00006461 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6462 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 return NULL;
6464
6465 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006466 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 if (substring == NULL)
6468 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006469
Thomas Wouters477c8d52006-05-27 19:21:47 +00006470 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
Thomas Wouters477c8d52006-05-27 19:21:47 +00006472 result = PyInt_FromSsize_t(
6473 stringlib_count(self->str + start, end - start,
6474 substring->str, substring->length)
6475 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476
6477 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 return result;
6480}
6481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006482PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006483"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006485Encodes S using the codec registered for encoding. encoding defaults\n\
6486to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006487handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6489'xmlcharrefreplace' as well as any other name registered with\n\
6490codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
6492static PyObject *
6493unicode_encode(PyUnicodeObject *self, PyObject *args)
6494{
6495 char *encoding = NULL;
6496 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006497 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6500 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006501 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006502 if (v == NULL)
6503 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006504 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006505 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006506 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006507 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006508 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006509 Py_DECREF(v);
6510 return NULL;
6511 }
6512 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006513
6514 onError:
6515 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006516}
6517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006518PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519"S.expandtabs([tabsize]) -> unicode\n\
6520\n\
6521Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006522If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524static PyObject*
6525unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6526{
6527 Py_UNICODE *e;
6528 Py_UNICODE *p;
6529 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006530 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 PyUnicodeObject *u;
6532 int tabsize = 8;
6533
6534 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6535 return NULL;
6536
Thomas Wouters7e474022000-07-16 12:04:32 +00006537 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006538 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 e = self->str + self->length;
6540 for (p = self->str; p < e; p++)
6541 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006542 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006544 if (old_j > j) {
6545 PyErr_SetString(PyExc_OverflowError,
6546 "new string is too long");
6547 return NULL;
6548 }
6549 old_j = j;
6550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 }
6552 else {
6553 j++;
6554 if (*p == '\n' || *p == '\r') {
6555 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006556 old_j = j = 0;
6557 if (i < 0) {
6558 PyErr_SetString(PyExc_OverflowError,
6559 "new string is too long");
6560 return NULL;
6561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 }
6563 }
6564
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006565 if ((i + j) < 0) {
6566 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6567 return NULL;
6568 }
6569
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 /* Second pass: create output string and fill it */
6571 u = _PyUnicode_New(i + j);
6572 if (!u)
6573 return NULL;
6574
6575 j = 0;
6576 q = u->str;
6577
6578 for (p = self->str; p < e; p++)
6579 if (*p == '\t') {
6580 if (tabsize > 0) {
6581 i = tabsize - (j % tabsize);
6582 j += i;
6583 while (i--)
6584 *q++ = ' ';
6585 }
6586 }
6587 else {
6588 j++;
6589 *q++ = *p;
6590 if (*p == '\n' || *p == '\r')
6591 j = 0;
6592 }
6593
6594 return (PyObject*) u;
6595}
6596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006597PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598"S.find(sub [,start [,end]]) -> int\n\
6599\n\
6600Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006601such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602arguments start and end are interpreted as in slice notation.\n\
6603\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006604Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605
6606static PyObject *
6607unicode_find(PyUnicodeObject *self, PyObject *args)
6608{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006609 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006610 Py_ssize_t start;
6611 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006612 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
Christian Heimes9cd17752007-11-18 19:35:23 +00006614 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616
Thomas Wouters477c8d52006-05-27 19:21:47 +00006617 result = stringlib_find_slice(
6618 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6619 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6620 start, end
6621 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
6623 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006624
6625 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626}
6627
6628static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006629unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630{
6631 if (index < 0 || index >= self->length) {
6632 PyErr_SetString(PyExc_IndexError, "string index out of range");
6633 return NULL;
6634 }
6635
6636 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6637}
6638
Guido van Rossumc2504932007-09-18 19:42:40 +00006639/* Believe it or not, this produces the same value for ASCII strings
6640 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006642unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643{
Guido van Rossumc2504932007-09-18 19:42:40 +00006644 Py_ssize_t len;
6645 Py_UNICODE *p;
6646 long x;
6647
6648 if (self->hash != -1)
6649 return self->hash;
6650 len = Py_Size(self);
6651 p = self->str;
6652 x = *p << 7;
6653 while (--len >= 0)
6654 x = (1000003*x) ^ *p++;
6655 x ^= Py_Size(self);
6656 if (x == -1)
6657 x = -2;
6658 self->hash = x;
6659 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660}
6661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006662PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663"S.index(sub [,start [,end]]) -> int\n\
6664\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006665Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
6667static PyObject *
6668unicode_index(PyUnicodeObject *self, PyObject *args)
6669{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006670 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006671 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006672 Py_ssize_t start;
6673 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674
Christian Heimes9cd17752007-11-18 19:35:23 +00006675 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
Thomas Wouters477c8d52006-05-27 19:21:47 +00006678 result = stringlib_find_slice(
6679 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6680 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6681 start, end
6682 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
6684 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006685
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 if (result < 0) {
6687 PyErr_SetString(PyExc_ValueError, "substring not found");
6688 return NULL;
6689 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006690
Martin v. Löwis18e16552006-02-15 17:27:45 +00006691 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692}
6693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006694PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006695"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006697Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006698at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699
6700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006701unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702{
6703 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6704 register const Py_UNICODE *e;
6705 int cased;
6706
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 /* Shortcut for single character strings */
6708 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006709 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006711 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006712 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006713 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006714
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 e = p + PyUnicode_GET_SIZE(self);
6716 cased = 0;
6717 for (; p < e; p++) {
6718 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006719
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006721 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 else if (!cased && Py_UNICODE_ISLOWER(ch))
6723 cased = 1;
6724 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006725 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726}
6727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006728PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006729"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006731Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006732at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733
6734static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006735unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736{
6737 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6738 register const Py_UNICODE *e;
6739 int cased;
6740
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 /* Shortcut for single character strings */
6742 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006743 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006745 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006746 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006747 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006748
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 e = p + PyUnicode_GET_SIZE(self);
6750 cased = 0;
6751 for (; p < e; p++) {
6752 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006753
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006755 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 else if (!cased && Py_UNICODE_ISUPPER(ch))
6757 cased = 1;
6758 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006759 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760}
6761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006762PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006763"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006765Return True if S is a titlecased string and there is at least one\n\
6766character in S, i.e. upper- and titlecase characters may only\n\
6767follow uncased characters and lowercase characters only cased ones.\n\
6768Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769
6770static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006771unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
6773 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6774 register const Py_UNICODE *e;
6775 int cased, previous_is_cased;
6776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 /* Shortcut for single character strings */
6778 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006779 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6780 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006782 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006783 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006784 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006785
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 e = p + PyUnicode_GET_SIZE(self);
6787 cased = 0;
6788 previous_is_cased = 0;
6789 for (; p < e; p++) {
6790 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006791
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6793 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006794 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 previous_is_cased = 1;
6796 cased = 1;
6797 }
6798 else if (Py_UNICODE_ISLOWER(ch)) {
6799 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006800 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 previous_is_cased = 1;
6802 cased = 1;
6803 }
6804 else
6805 previous_is_cased = 0;
6806 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006807 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808}
6809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006811"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006813Return True if all characters in S are whitespace\n\
6814and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815
6816static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006817unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818{
6819 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6820 register const Py_UNICODE *e;
6821
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 /* Shortcut for single character strings */
6823 if (PyUnicode_GET_SIZE(self) == 1 &&
6824 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006827 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006828 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 e = p + PyUnicode_GET_SIZE(self);
6832 for (; p < e; p++) {
6833 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006834 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006836 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837}
6838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006841\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006842Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006844
6845static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006846unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006847{
6848 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6849 register const Py_UNICODE *e;
6850
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006851 /* Shortcut for single character strings */
6852 if (PyUnicode_GET_SIZE(self) == 1 &&
6853 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006855
6856 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006857 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006858 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859
6860 e = p + PyUnicode_GET_SIZE(self);
6861 for (; p < e; p++) {
6862 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006863 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006864 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006865 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006866}
6867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006868PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006870\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006871Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006872and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006873
6874static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006875unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006876{
6877 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6878 register const Py_UNICODE *e;
6879
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006880 /* Shortcut for single character strings */
6881 if (PyUnicode_GET_SIZE(self) == 1 &&
6882 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006884
6885 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006886 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006887 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006888
6889 e = p + PyUnicode_GET_SIZE(self);
6890 for (; p < e; p++) {
6891 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006892 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006893 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006894 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006895}
6896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006897PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006900Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006901False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
6903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006904unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905{
6906 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6907 register const Py_UNICODE *e;
6908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 /* Shortcut for single character strings */
6910 if (PyUnicode_GET_SIZE(self) == 1 &&
6911 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006912 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006914 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006915 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006916 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006917
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 e = p + PyUnicode_GET_SIZE(self);
6919 for (; p < e; p++) {
6920 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006921 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006923 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924}
6925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006926PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006929Return True if all characters in S are digits\n\
6930and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
6932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006933unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
6935 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6936 register const Py_UNICODE *e;
6937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 /* Shortcut for single character strings */
6939 if (PyUnicode_GET_SIZE(self) == 1 &&
6940 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006943 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006944 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006945 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 e = p + PyUnicode_GET_SIZE(self);
6948 for (; p < e; p++) {
6949 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006950 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006952 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953}
6954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006955PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006956"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006958Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006959False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
6961static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006962unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963{
6964 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6965 register const Py_UNICODE *e;
6966
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 /* Shortcut for single character strings */
6968 if (PyUnicode_GET_SIZE(self) == 1 &&
6969 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006972 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006973 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006974 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006975
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 e = p + PyUnicode_GET_SIZE(self);
6977 for (; p < e; p++) {
6978 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006979 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006981 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982}
6983
Martin v. Löwis47383402007-08-15 07:32:56 +00006984int
6985PyUnicode_IsIdentifier(PyObject *self)
6986{
6987 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6988 register const Py_UNICODE *e;
6989
6990 /* Special case for empty strings */
6991 if (PyUnicode_GET_SIZE(self) == 0)
6992 return 0;
6993
6994 /* PEP 3131 says that the first character must be in
6995 XID_Start and subsequent characters in XID_Continue,
6996 and for the ASCII range, the 2.x rules apply (i.e
6997 start with letters and underscore, continue with
6998 letters, digits, underscore). However, given the current
6999 definition of XID_Start and XID_Continue, it is sufficient
7000 to check just for these, except that _ must be allowed
7001 as starting an identifier. */
7002 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7003 return 0;
7004
7005 e = p + PyUnicode_GET_SIZE(self);
7006 for (p++; p < e; p++) {
7007 if (!_PyUnicode_IsXidContinue(*p))
7008 return 0;
7009 }
7010 return 1;
7011}
7012
7013PyDoc_STRVAR(isidentifier__doc__,
7014"S.isidentifier() -> bool\n\
7015\n\
7016Return True if S is a valid identifier according\n\
7017to the language definition.");
7018
7019static PyObject*
7020unicode_isidentifier(PyObject *self)
7021{
7022 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7023}
7024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026"S.join(sequence) -> unicode\n\
7027\n\
7028Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030
7031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007032unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007034 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035}
7036
Martin v. Löwis18e16552006-02-15 17:27:45 +00007037static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038unicode_length(PyUnicodeObject *self)
7039{
7040 return self->length;
7041}
7042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007043PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007044"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045\n\
7046Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007047done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048
7049static PyObject *
7050unicode_ljust(PyUnicodeObject *self, PyObject *args)
7051{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007052 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007053 Py_UNICODE fillchar = ' ';
7054
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007055 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 return NULL;
7057
Tim Peters7a29bd52001-09-12 03:03:31 +00007058 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 Py_INCREF(self);
7060 return (PyObject*) self;
7061 }
7062
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007063 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067"S.lower() -> unicode\n\
7068\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007069Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070
7071static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007072unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 return fixup(self, fixlower);
7075}
7076
/* Selectors for the three strip flavours; they double as indices
   into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for selector i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7085
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086/* externally visible for str.strip(unicode) */
7087PyObject *
7088_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7089{
7090 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007091 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007092 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007093 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7094 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007095
Thomas Wouters477c8d52006-05-27 19:21:47 +00007096 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7097
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007098 i = 0;
7099 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007100 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7101 i++;
7102 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007103 }
7104
7105 j = len;
7106 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007107 do {
7108 j--;
7109 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7110 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111 }
7112
7113 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007114 Py_INCREF(self);
7115 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007116 }
7117 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007118 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007119}
7120
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007123do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007125 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007126 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007127
7128 i = 0;
7129 if (striptype != RIGHTSTRIP) {
7130 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7131 i++;
7132 }
7133 }
7134
7135 j = len;
7136 if (striptype != LEFTSTRIP) {
7137 do {
7138 j--;
7139 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7140 j++;
7141 }
7142
7143 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7144 Py_INCREF(self);
7145 return (PyObject*)self;
7146 }
7147 else
7148 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149}
7150
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007151
7152static PyObject *
7153do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7154{
7155 PyObject *sep = NULL;
7156
7157 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7158 return NULL;
7159
7160 if (sep != NULL && sep != Py_None) {
7161 if (PyUnicode_Check(sep))
7162 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007163 else {
7164 PyErr_Format(PyExc_TypeError,
7165 "%s arg must be None, unicode or str",
7166 STRIPNAME(striptype));
7167 return NULL;
7168 }
7169 }
7170
7171 return do_strip(self, striptype);
7172}
7173
7174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007175PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007176"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007177\n\
7178Return a copy of the string S with leading and trailing\n\
7179whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007180If chars is given and not None, remove characters in chars instead.\n\
7181If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182
7183static PyObject *
7184unicode_strip(PyUnicodeObject *self, PyObject *args)
7185{
7186 if (PyTuple_GET_SIZE(args) == 0)
7187 return do_strip(self, BOTHSTRIP); /* Common case */
7188 else
7189 return do_argstrip(self, BOTHSTRIP, args);
7190}
7191
7192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007194"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007195\n\
7196Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007197If chars is given and not None, remove characters in chars instead.\n\
7198If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007199
7200static PyObject *
7201unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7202{
7203 if (PyTuple_GET_SIZE(args) == 0)
7204 return do_strip(self, LEFTSTRIP); /* Common case */
7205 else
7206 return do_argstrip(self, LEFTSTRIP, args);
7207}
7208
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007211"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007212\n\
7213Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007214If chars is given and not None, remove characters in chars instead.\n\
7215If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007216
7217static PyObject *
7218unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7219{
7220 if (PyTuple_GET_SIZE(args) == 0)
7221 return do_strip(self, RIGHTSTRIP); /* Common case */
7222 else
7223 return do_argstrip(self, RIGHTSTRIP, args);
7224}
7225
7226
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007228unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229{
7230 PyUnicodeObject *u;
7231 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007232 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007233 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234
7235 if (len < 0)
7236 len = 0;
7237
Tim Peters7a29bd52001-09-12 03:03:31 +00007238 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 /* no repeat, return original string */
7240 Py_INCREF(str);
7241 return (PyObject*) str;
7242 }
Tim Peters8f422462000-09-09 06:13:41 +00007243
7244 /* ensure # of chars needed doesn't overflow int and # of bytes
7245 * needed doesn't overflow size_t
7246 */
7247 nchars = len * str->length;
7248 if (len && nchars / len != str->length) {
7249 PyErr_SetString(PyExc_OverflowError,
7250 "repeated string is too long");
7251 return NULL;
7252 }
7253 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7254 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7255 PyErr_SetString(PyExc_OverflowError,
7256 "repeated string is too long");
7257 return NULL;
7258 }
7259 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 if (!u)
7261 return NULL;
7262
7263 p = u->str;
7264
Thomas Wouters477c8d52006-05-27 19:21:47 +00007265 if (str->length == 1 && len > 0) {
7266 Py_UNICODE_FILL(p, str->str[0], len);
7267 } else {
7268 Py_ssize_t done = 0; /* number of characters copied this far */
7269 if (done < nchars) {
7270 Py_UNICODE_COPY(p, str->str, str->length);
7271 done = str->length;
7272 }
7273 while (done < nchars) {
7274 int n = (done <= nchars-done) ? done : nchars-done;
7275 Py_UNICODE_COPY(p+done, p, n);
7276 done += n;
7277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 }
7279
7280 return (PyObject*) u;
7281}
7282
7283PyObject *PyUnicode_Replace(PyObject *obj,
7284 PyObject *subobj,
7285 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287{
7288 PyObject *self;
7289 PyObject *str1;
7290 PyObject *str2;
7291 PyObject *result;
7292
7293 self = PyUnicode_FromObject(obj);
7294 if (self == NULL)
7295 return NULL;
7296 str1 = PyUnicode_FromObject(subobj);
7297 if (str1 == NULL) {
7298 Py_DECREF(self);
7299 return NULL;
7300 }
7301 str2 = PyUnicode_FromObject(replobj);
7302 if (str2 == NULL) {
7303 Py_DECREF(self);
7304 Py_DECREF(str1);
7305 return NULL;
7306 }
Tim Petersced69f82003-09-16 20:30:58 +00007307 result = replace((PyUnicodeObject *)self,
7308 (PyUnicodeObject *)str1,
7309 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 maxcount);
7311 Py_DECREF(self);
7312 Py_DECREF(str1);
7313 Py_DECREF(str2);
7314 return result;
7315}
7316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007317PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318"S.replace (old, new[, maxsplit]) -> unicode\n\
7319\n\
7320Return a copy of S with all occurrences of substring\n\
7321old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007322given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
7324static PyObject*
7325unicode_replace(PyUnicodeObject *self, PyObject *args)
7326{
7327 PyUnicodeObject *str1;
7328 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007329 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 PyObject *result;
7331
Martin v. Löwis18e16552006-02-15 17:27:45 +00007332 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 return NULL;
7334 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7335 if (str1 == NULL)
7336 return NULL;
7337 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007338 if (str2 == NULL) {
7339 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342
7343 result = replace(self, str1, str2, maxcount);
7344
7345 Py_DECREF(str1);
7346 Py_DECREF(str2);
7347 return result;
7348}
7349
7350static
7351PyObject *unicode_repr(PyObject *unicode)
7352{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007353 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007354 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007355 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7356 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7357
7358 /* XXX(nnorwitz): rather than over-allocating, it would be
7359 better to choose a different scheme. Perhaps scan the
7360 first N-chars of the string and allocate based on that size.
7361 */
7362 /* Initial allocation is based on the longest-possible unichr
7363 escape.
7364
7365 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7366 unichr, so in this case it's the longest unichr escape. In
7367 narrow (UTF-16) builds this is five chars per source unichr
7368 since there are two unichrs in the surrogate pair, so in narrow
7369 (UTF-16) builds it's not the longest unichr escape.
7370
7371 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7372 so in the narrow (UTF-16) build case it's the longest unichr
7373 escape.
7374 */
7375
Walter Dörwald1ab83302007-05-18 17:15:44 +00007376 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007377 2 /* quotes */
7378#ifdef Py_UNICODE_WIDE
7379 + 10*size
7380#else
7381 + 6*size
7382#endif
7383 + 1);
7384 if (repr == NULL)
7385 return NULL;
7386
Walter Dörwald1ab83302007-05-18 17:15:44 +00007387 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007388
7389 /* Add quote */
7390 *p++ = (findchar(s, size, '\'') &&
7391 !findchar(s, size, '"')) ? '"' : '\'';
7392 while (size-- > 0) {
7393 Py_UNICODE ch = *s++;
7394
7395 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007396 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007397 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007398 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007399 continue;
7400 }
7401
7402#ifdef Py_UNICODE_WIDE
7403 /* Map 21-bit characters to '\U00xxxxxx' */
7404 else if (ch >= 0x10000) {
7405 *p++ = '\\';
7406 *p++ = 'U';
7407 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7408 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7409 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7410 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7411 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7412 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7413 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7414 *p++ = hexdigits[ch & 0x0000000F];
7415 continue;
7416 }
7417#else
7418 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7419 else if (ch >= 0xD800 && ch < 0xDC00) {
7420 Py_UNICODE ch2;
7421 Py_UCS4 ucs;
7422
7423 ch2 = *s++;
7424 size--;
7425 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7426 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7427 *p++ = '\\';
7428 *p++ = 'U';
7429 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7430 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7431 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7432 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7433 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7434 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7435 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7436 *p++ = hexdigits[ucs & 0x0000000F];
7437 continue;
7438 }
7439 /* Fall through: isolated surrogates are copied as-is */
7440 s--;
7441 size++;
7442 }
7443#endif
7444
7445 /* Map 16-bit characters to '\uxxxx' */
7446 if (ch >= 256) {
7447 *p++ = '\\';
7448 *p++ = 'u';
7449 *p++ = hexdigits[(ch >> 12) & 0x000F];
7450 *p++ = hexdigits[(ch >> 8) & 0x000F];
7451 *p++ = hexdigits[(ch >> 4) & 0x000F];
7452 *p++ = hexdigits[ch & 0x000F];
7453 }
7454
7455 /* Map special whitespace to '\t', \n', '\r' */
7456 else if (ch == '\t') {
7457 *p++ = '\\';
7458 *p++ = 't';
7459 }
7460 else if (ch == '\n') {
7461 *p++ = '\\';
7462 *p++ = 'n';
7463 }
7464 else if (ch == '\r') {
7465 *p++ = '\\';
7466 *p++ = 'r';
7467 }
7468
7469 /* Map non-printable US ASCII to '\xhh' */
7470 else if (ch < ' ' || ch >= 0x7F) {
7471 *p++ = '\\';
7472 *p++ = 'x';
7473 *p++ = hexdigits[(ch >> 4) & 0x000F];
7474 *p++ = hexdigits[ch & 0x000F];
7475 }
7476
7477 /* Copy everything else as-is */
7478 else
7479 *p++ = (char) ch;
7480 }
7481 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007482 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007483
7484 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007485 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007486 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487}
7488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007489PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490"S.rfind(sub [,start [,end]]) -> int\n\
7491\n\
7492Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007493such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494arguments start and end are interpreted as in slice notation.\n\
7495\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
7498static PyObject *
7499unicode_rfind(PyUnicodeObject *self, PyObject *args)
7500{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007502 Py_ssize_t start;
7503 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007504 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505
Christian Heimes9cd17752007-11-18 19:35:23 +00007506 if (!_ParseTupleFinds(args, &substring, &start, &end))
7507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508
Thomas Wouters477c8d52006-05-27 19:21:47 +00007509 result = stringlib_rfind_slice(
7510 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7511 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7512 start, end
7513 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514
7515 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007516
7517 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518}
7519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007520PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521"S.rindex(sub [,start [,end]]) -> int\n\
7522\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007523Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
7525static PyObject *
7526unicode_rindex(PyUnicodeObject *self, PyObject *args)
7527{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007528 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007529 Py_ssize_t start;
7530 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007531 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532
Christian Heimes9cd17752007-11-18 19:35:23 +00007533 if (!_ParseTupleFinds(args, &substring, &start, &end))
7534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535
Thomas Wouters477c8d52006-05-27 19:21:47 +00007536 result = stringlib_rfind_slice(
7537 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7538 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7539 start, end
7540 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541
7542 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007543
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 if (result < 0) {
7545 PyErr_SetString(PyExc_ValueError, "substring not found");
7546 return NULL;
7547 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007548 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549}
7550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007551PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007552"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553\n\
7554Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007555done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
7557static PyObject *
7558unicode_rjust(PyUnicodeObject *self, PyObject *args)
7559{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007560 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007561 Py_UNICODE fillchar = ' ';
7562
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007563 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 return NULL;
7565
Tim Peters7a29bd52001-09-12 03:03:31 +00007566 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 Py_INCREF(self);
7568 return (PyObject*) self;
7569 }
7570
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007571 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572}
7573
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574PyObject *PyUnicode_Split(PyObject *s,
7575 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007576 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577{
7578 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007579
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 s = PyUnicode_FromObject(s);
7581 if (s == NULL)
7582 return NULL;
7583 if (sep != NULL) {
7584 sep = PyUnicode_FromObject(sep);
7585 if (sep == NULL) {
7586 Py_DECREF(s);
7587 return NULL;
7588 }
7589 }
7590
7591 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7592
7593 Py_DECREF(s);
7594 Py_XDECREF(sep);
7595 return result;
7596}
7597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007598PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599"S.split([sep [,maxsplit]]) -> list of strings\n\
7600\n\
7601Return a list of the words in S, using sep as the\n\
7602delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007603splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007604any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605
7606static PyObject*
7607unicode_split(PyUnicodeObject *self, PyObject *args)
7608{
7609 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007610 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611
Martin v. Löwis18e16552006-02-15 17:27:45 +00007612 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 return NULL;
7614
7615 if (substring == Py_None)
7616 return split(self, NULL, maxcount);
7617 else if (PyUnicode_Check(substring))
7618 return split(self, (PyUnicodeObject *)substring, maxcount);
7619 else
7620 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7621}
7622
Thomas Wouters477c8d52006-05-27 19:21:47 +00007623PyObject *
7624PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7625{
7626 PyObject* str_obj;
7627 PyObject* sep_obj;
7628 PyObject* out;
7629
7630 str_obj = PyUnicode_FromObject(str_in);
7631 if (!str_obj)
7632 return NULL;
7633 sep_obj = PyUnicode_FromObject(sep_in);
7634 if (!sep_obj) {
7635 Py_DECREF(str_obj);
7636 return NULL;
7637 }
7638
7639 out = stringlib_partition(
7640 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7641 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7642 );
7643
7644 Py_DECREF(sep_obj);
7645 Py_DECREF(str_obj);
7646
7647 return out;
7648}
7649
7650
7651PyObject *
7652PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7653{
7654 PyObject* str_obj;
7655 PyObject* sep_obj;
7656 PyObject* out;
7657
7658 str_obj = PyUnicode_FromObject(str_in);
7659 if (!str_obj)
7660 return NULL;
7661 sep_obj = PyUnicode_FromObject(sep_in);
7662 if (!sep_obj) {
7663 Py_DECREF(str_obj);
7664 return NULL;
7665 }
7666
7667 out = stringlib_rpartition(
7668 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7669 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7670 );
7671
7672 Py_DECREF(sep_obj);
7673 Py_DECREF(str_obj);
7674
7675 return out;
7676}
7677
7678PyDoc_STRVAR(partition__doc__,
7679"S.partition(sep) -> (head, sep, tail)\n\
7680\n\
7681Searches for the separator sep in S, and returns the part before it,\n\
7682the separator itself, and the part after it. If the separator is not\n\
7683found, returns S and two empty strings.");
7684
7685static PyObject*
7686unicode_partition(PyUnicodeObject *self, PyObject *separator)
7687{
7688 return PyUnicode_Partition((PyObject *)self, separator);
7689}
7690
7691PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007692"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007693\n\
7694Searches for the separator sep in S, starting at the end of S, and returns\n\
7695the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007696separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007697
7698static PyObject*
7699unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7700{
7701 return PyUnicode_RPartition((PyObject *)self, separator);
7702}
7703
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007704PyObject *PyUnicode_RSplit(PyObject *s,
7705 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007706 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007707{
7708 PyObject *result;
7709
7710 s = PyUnicode_FromObject(s);
7711 if (s == NULL)
7712 return NULL;
7713 if (sep != NULL) {
7714 sep = PyUnicode_FromObject(sep);
7715 if (sep == NULL) {
7716 Py_DECREF(s);
7717 return NULL;
7718 }
7719 }
7720
7721 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7722
7723 Py_DECREF(s);
7724 Py_XDECREF(sep);
7725 return result;
7726}
7727
7728PyDoc_STRVAR(rsplit__doc__,
7729"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7730\n\
7731Return a list of the words in S, using sep as the\n\
7732delimiter string, starting at the end of the string and\n\
7733working to the front. If maxsplit is given, at most maxsplit\n\
7734splits are done. If sep is not specified, any whitespace string\n\
7735is a separator.");
7736
7737static PyObject*
7738unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7739{
7740 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007741 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007742
Martin v. Löwis18e16552006-02-15 17:27:45 +00007743 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007744 return NULL;
7745
7746 if (substring == Py_None)
7747 return rsplit(self, NULL, maxcount);
7748 else if (PyUnicode_Check(substring))
7749 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7750 else
7751 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7752}
7753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007754PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007755"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756\n\
7757Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007758Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007759is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760
7761static PyObject*
7762unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7763{
Guido van Rossum86662912000-04-11 15:38:46 +00007764 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
Guido van Rossum86662912000-04-11 15:38:46 +00007766 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767 return NULL;
7768
Guido van Rossum86662912000-04-11 15:38:46 +00007769 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770}
7771
7772static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007773PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774{
Walter Dörwald346737f2007-05-31 10:44:43 +00007775 if (PyUnicode_CheckExact(self)) {
7776 Py_INCREF(self);
7777 return self;
7778 } else
7779 /* Subtype -- return genuine unicode string with the same value. */
7780 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7781 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782}
7783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007784PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785"S.swapcase() -> unicode\n\
7786\n\
7787Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007788and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789
7790static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007791unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 return fixup(self, fixswapcase);
7794}
7795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007796PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797"S.translate(table) -> unicode\n\
7798\n\
7799Return a copy of the string S, where all characters have been mapped\n\
7800through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007801Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7802Unmapped characters are left untouched. Characters mapped to None\n\
7803are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007806unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807{
Georg Brandl94c2c752007-10-23 06:52:59 +00007808 PyObject *newtable = NULL;
7809 Py_ssize_t i = 0;
7810 PyObject *key, *value, *result;
7811
7812 if (!PyDict_Check(table)) {
7813 PyErr_SetString(PyExc_TypeError, "translate argument must be a dict");
7814 return NULL;
7815 }
7816 /* fixup the table -- allow size-1 string keys instead of only int keys */
7817 newtable = PyDict_Copy(table);
7818 if (!newtable) return NULL;
7819 while (PyDict_Next(table, &i, &key, &value)) {
7820 if (PyUnicode_Check(key)) {
7821 /* convert string keys to integer keys */
7822 PyObject *newkey;
7823 int res;
7824 if (PyUnicode_GET_SIZE(key) != 1) {
7825 PyErr_SetString(PyExc_ValueError, "string items in translate "
7826 "table must be 1 element long");
7827 goto err;
7828 }
7829 newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]);
7830 if (!newkey)
7831 goto err;
7832 res = PyDict_SetItem(newtable, newkey, value);
7833 Py_DECREF(newkey);
7834 if (res < 0)
7835 goto err;
7836 } else if (PyInt_Check(key)) {
7837 /* just keep integer keys */
7838 if (PyDict_SetItem(newtable, key, value) < 0)
7839 goto err;
7840 } else {
7841 PyErr_SetString(PyExc_TypeError, "items in translate table must be "
7842 "strings or integers");
7843 goto err;
7844 }
7845 }
7846
7847 result = PyUnicode_TranslateCharmap(self->str,
7848 self->length,
7849 newtable,
7850 "ignore");
7851 Py_DECREF(newtable);
7852 return result;
7853 err:
7854 Py_DECREF(newtable);
7855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856}
7857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859"S.upper() -> unicode\n\
7860\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007861Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862
7863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007864unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 return fixup(self, fixupper);
7867}
7868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007869PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870"S.zfill(width) -> unicode\n\
7871\n\
7872Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007873of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874
7875static PyObject *
7876unicode_zfill(PyUnicodeObject *self, PyObject *args)
7877{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007878 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 PyUnicodeObject *u;
7880
Martin v. Löwis18e16552006-02-15 17:27:45 +00007881 Py_ssize_t width;
7882 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883 return NULL;
7884
7885 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007886 if (PyUnicode_CheckExact(self)) {
7887 Py_INCREF(self);
7888 return (PyObject*) self;
7889 }
7890 else
7891 return PyUnicode_FromUnicode(
7892 PyUnicode_AS_UNICODE(self),
7893 PyUnicode_GET_SIZE(self)
7894 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895 }
7896
7897 fill = width - self->length;
7898
7899 u = pad(self, fill, 0, '0');
7900
Walter Dörwald068325e2002-04-15 13:36:47 +00007901 if (u == NULL)
7902 return NULL;
7903
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904 if (u->str[fill] == '+' || u->str[fill] == '-') {
7905 /* move sign to beginning of string */
7906 u->str[0] = u->str[fill];
7907 u->str[fill] = '0';
7908 }
7909
7910 return (PyObject*) u;
7911}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as
   a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007921PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007922"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007924Return True if S starts with the specified prefix, False otherwise.\n\
7925With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007926With optional end, stop comparing S at that position.\n\
7927prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928
7929static PyObject *
7930unicode_startswith(PyUnicodeObject *self,
7931 PyObject *args)
7932{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007933 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007935 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007936 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007937 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007939 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007940 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007942 if (PyTuple_Check(subobj)) {
7943 Py_ssize_t i;
7944 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7945 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7946 PyTuple_GET_ITEM(subobj, i));
7947 if (substring == NULL)
7948 return NULL;
7949 result = tailmatch(self, substring, start, end, -1);
7950 Py_DECREF(substring);
7951 if (result) {
7952 Py_RETURN_TRUE;
7953 }
7954 }
7955 /* nothing matched */
7956 Py_RETURN_FALSE;
7957 }
7958 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007960 return NULL;
7961 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007963 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964}
7965
7966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007967PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007968"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007970Return True if S ends with the specified suffix, False otherwise.\n\
7971With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007972With optional end, stop comparing S at that position.\n\
7973suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974
7975static PyObject *
7976unicode_endswith(PyUnicodeObject *self,
7977 PyObject *args)
7978{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007979 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007981 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007982 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007983 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007985 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7986 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007988 if (PyTuple_Check(subobj)) {
7989 Py_ssize_t i;
7990 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7991 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7992 PyTuple_GET_ITEM(subobj, i));
7993 if (substring == NULL)
7994 return NULL;
7995 result = tailmatch(self, substring, start, end, +1);
7996 Py_DECREF(substring);
7997 if (result) {
7998 Py_RETURN_TRUE;
7999 }
8000 }
8001 Py_RETURN_FALSE;
8002 }
8003 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008007 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008009 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010}
8011
Eric Smith8c663262007-08-25 02:26:07 +00008012#include "stringlib/string_format.h"
8013
8014PyDoc_STRVAR(format__doc__,
8015"S.format(*args, **kwargs) -> unicode\n\
8016\n\
8017");
8018
Eric Smith8c663262007-08-25 02:26:07 +00008019PyDoc_STRVAR(p_format__doc__,
8020"S.__format__(format_spec) -> unicode\n\
8021\n\
8022");
8023
8024static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008025unicode_getnewargs(PyUnicodeObject *v)
8026{
8027 return Py_BuildValue("(u#)", v->str, v->length);
8028}
8029
8030
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031static PyMethodDef unicode_methods[] = {
8032
8033 /* Order is according to common usage: often used methods should
8034 appear first, since lookup is done sequentially. */
8035
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008036 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8037 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8038 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008039 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008040 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8041 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8042 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8043 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8044 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8045 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8046 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008047 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008048 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8049 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8050 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008051 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008052 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8053 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8054 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008055 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008056 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008057 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008058 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008059 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8060 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8061 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8062 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8063 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8064 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8065 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8066 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8067 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8068 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8069 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8070 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8071 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8072 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008073 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008074 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008075 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8076 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008077 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8078 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008079#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008080 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081#endif
8082
8083#if 0
8084 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008085 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086#endif
8087
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008088 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089 {NULL, NULL}
8090};
8091
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008092static PyObject *
8093unicode_mod(PyObject *v, PyObject *w)
8094{
8095 if (!PyUnicode_Check(v)) {
8096 Py_INCREF(Py_NotImplemented);
8097 return Py_NotImplemented;
8098 }
8099 return PyUnicode_Format(v, w);
8100}
8101
8102static PyNumberMethods unicode_as_number = {
8103 0, /*nb_add*/
8104 0, /*nb_subtract*/
8105 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008106 unicode_mod, /*nb_remainder*/
8107};
8108
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008111 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8113 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008114 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 0, /* sq_ass_item */
8116 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008117 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118};
8119
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008120static PyObject*
8121unicode_subscript(PyUnicodeObject* self, PyObject* item)
8122{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008123 if (PyIndex_Check(item)) {
8124 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008125 if (i == -1 && PyErr_Occurred())
8126 return NULL;
8127 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008128 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008129 return unicode_getitem(self, i);
8130 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008131 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008132 Py_UNICODE* source_buf;
8133 Py_UNICODE* result_buf;
8134 PyObject* result;
8135
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008136 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008137 &start, &stop, &step, &slicelength) < 0) {
8138 return NULL;
8139 }
8140
8141 if (slicelength <= 0) {
8142 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008143 } else if (start == 0 && step == 1 && slicelength == self->length &&
8144 PyUnicode_CheckExact(self)) {
8145 Py_INCREF(self);
8146 return (PyObject *)self;
8147 } else if (step == 1) {
8148 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008149 } else {
8150 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008151 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8152 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008153
8154 if (result_buf == NULL)
8155 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008156
8157 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8158 result_buf[i] = source_buf[cur];
8159 }
Tim Petersced69f82003-09-16 20:30:58 +00008160
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008161 result = PyUnicode_FromUnicode(result_buf, slicelength);
8162 PyMem_FREE(result_buf);
8163 return result;
8164 }
8165 } else {
8166 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8167 return NULL;
8168 }
8169}
8170
8171static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008172 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008173 (binaryfunc)unicode_subscript, /* mp_subscript */
8174 (objobjargproc)0, /* mp_ass_subscript */
8175};
8176
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178/* Helpers for PyUnicode_Format() */
8179
8180static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008181getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008183 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 if (argidx < arglen) {
8185 (*p_argidx)++;
8186 if (arglen < 0)
8187 return args;
8188 else
8189 return PyTuple_GetItem(args, argidx);
8190 }
8191 PyErr_SetString(PyExc_TypeError,
8192 "not enough arguments for format string");
8193 return NULL;
8194}
8195
Martin v. Löwis18e16552006-02-15 17:27:45 +00008196static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008197strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008199 register Py_ssize_t i;
8200 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 for (i = len - 1; i >= 0; i--)
8202 buffer[i] = (Py_UNICODE) charbuffer[i];
8203
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204 return len;
8205}
8206
Neal Norwitzfc76d632006-01-10 06:03:13 +00008207static int
8208doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8209{
Tim Peters15231542006-02-16 01:08:01 +00008210 Py_ssize_t result;
8211
Neal Norwitzfc76d632006-01-10 06:03:13 +00008212 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008213 result = strtounicode(buffer, (char *)buffer);
8214 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008215}
8216
8217static int
8218longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8219{
Tim Peters15231542006-02-16 01:08:01 +00008220 Py_ssize_t result;
8221
Neal Norwitzfc76d632006-01-10 06:03:13 +00008222 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008223 result = strtounicode(buffer, (char *)buffer);
8224 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008225}
8226
Guido van Rossum078151d2002-08-11 04:24:12 +00008227/* XXX To save some code duplication, formatfloat/long/int could have been
8228 shared with stringobject.c, converting from 8-bit to Unicode after the
8229 formatting is done. */
8230
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231static int
8232formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008233 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234 int flags,
8235 int prec,
8236 int type,
8237 PyObject *v)
8238{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008239 /* fmt = '%#.' + `prec` + `type`
8240 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 char fmt[20];
8242 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008243
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 x = PyFloat_AsDouble(v);
8245 if (x == -1.0 && PyErr_Occurred())
8246 return -1;
8247 if (prec < 0)
8248 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8250 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008251 /* Worst case length calc to ensure no buffer overrun:
8252
8253 'g' formats:
8254 fmt = %#.<prec>g
8255 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8256 for any double rep.)
8257 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8258
8259 'f' formats:
8260 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8261 len = 1 + 50 + 1 + prec = 52 + prec
8262
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008263 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008264 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008265
8266 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008267 if (((type == 'g' || type == 'G') &&
8268 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008269 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008270 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008271 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008272 return -1;
8273 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008274 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8275 (flags&F_ALT) ? "#" : "",
8276 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008277 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278}
8279
Tim Peters38fd5b62000-09-21 05:43:11 +00008280static PyObject*
8281formatlong(PyObject *val, int flags, int prec, int type)
8282{
8283 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008284 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008285 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008286 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008287
8288 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8289 if (!str)
8290 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008291 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008292 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008293 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008294}
8295
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296static int
8297formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008298 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 int flags,
8300 int prec,
8301 int type,
8302 PyObject *v)
8303{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008304 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008305 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8306 * + 1 + 1
8307 * = 24
8308 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008309 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008310 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 long x;
8312
8313 x = PyInt_AsLong(v);
8314 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008315 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008316 if (x < 0 && type == 'u') {
8317 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008318 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008319 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8320 sign = "-";
8321 else
8322 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008324 prec = 1;
8325
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008326 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8327 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008328 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008329 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008330 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008331 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008332 return -1;
8333 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008334
8335 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008336 (type == 'x' || type == 'X' || type == 'o')) {
8337 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008338 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008339 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008340 * - when 0 is being converted, the C standard leaves off
8341 * the '0x' or '0X', which is inconsistent with other
8342 * %#x/%#X conversions and inconsistent with Python's
8343 * hex() function
8344 * - there are platforms that violate the standard and
8345 * convert 0 with the '0x' or '0X'
8346 * (Metrowerks, Compaq Tru64)
8347 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008348 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008349 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008350 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008351 * We can achieve the desired consistency by inserting our
8352 * own '0x' or '0X' prefix, and substituting %x/%X in place
8353 * of %#x/%#X.
8354 *
8355 * Note that this is the same approach as used in
8356 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008357 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008358 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8359 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008360 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008361 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008362 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8363 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008364 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008365 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008366 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008367 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008368 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008369 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370}
8371
8372static int
8373formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008374 size_t buflen,
8375 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008377 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008378 if (PyUnicode_Check(v)) {
8379 if (PyUnicode_GET_SIZE(v) != 1)
8380 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 else {
8384 /* Integer input truncated to a character */
8385 long x;
8386 x = PyInt_AsLong(v);
8387 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008388 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008389#ifdef Py_UNICODE_WIDE
8390 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008391 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008392 "%c arg not in range(0x110000) "
8393 "(wide Python build)");
8394 return -1;
8395 }
8396#else
8397 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008398 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008399 "%c arg not in range(0x10000) "
8400 "(narrow Python build)");
8401 return -1;
8402 }
8403#endif
8404 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 }
8406 buf[1] = '\0';
8407 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008408
8409 onError:
8410 PyErr_SetString(PyExc_TypeError,
8411 "%c requires int or char");
8412 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413}
8414
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used (e.g. as the operand
   of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8424
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425PyObject *PyUnicode_Format(PyObject *format,
8426 PyObject *args)
8427{
8428 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008429 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 int args_owned = 0;
8431 PyUnicodeObject *result = NULL;
8432 PyObject *dict = NULL;
8433 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008434
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 if (format == NULL || args == NULL) {
8436 PyErr_BadInternalCall();
8437 return NULL;
8438 }
8439 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008440 if (uformat == NULL)
8441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 fmt = PyUnicode_AS_UNICODE(uformat);
8443 fmtcnt = PyUnicode_GET_SIZE(uformat);
8444
8445 reslen = rescnt = fmtcnt + 100;
8446 result = _PyUnicode_New(reslen);
8447 if (result == NULL)
8448 goto onError;
8449 res = PyUnicode_AS_UNICODE(result);
8450
8451 if (PyTuple_Check(args)) {
8452 arglen = PyTuple_Size(args);
8453 argidx = 0;
8454 }
8455 else {
8456 arglen = -1;
8457 argidx = -2;
8458 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008459 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008460 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461 dict = args;
8462
8463 while (--fmtcnt >= 0) {
8464 if (*fmt != '%') {
8465 if (--rescnt < 0) {
8466 rescnt = fmtcnt + 100;
8467 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008468 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8471 --rescnt;
8472 }
8473 *res++ = *fmt++;
8474 }
8475 else {
8476 /* Got a format specifier */
8477 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480 Py_UNICODE c = '\0';
8481 Py_UNICODE fill;
8482 PyObject *v = NULL;
8483 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008484 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008486 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008487 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488
8489 fmt++;
8490 if (*fmt == '(') {
8491 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008492 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493 PyObject *key;
8494 int pcount = 1;
8495
8496 if (dict == NULL) {
8497 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008498 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 goto onError;
8500 }
8501 ++fmt;
8502 --fmtcnt;
8503 keystart = fmt;
8504 /* Skip over balanced parentheses */
8505 while (pcount > 0 && --fmtcnt >= 0) {
8506 if (*fmt == ')')
8507 --pcount;
8508 else if (*fmt == '(')
8509 ++pcount;
8510 fmt++;
8511 }
8512 keylen = fmt - keystart - 1;
8513 if (fmtcnt < 0 || pcount > 0) {
8514 PyErr_SetString(PyExc_ValueError,
8515 "incomplete format key");
8516 goto onError;
8517 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008518#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008519 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 then looked up since Python uses strings to hold
8521 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008522 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523 key = PyUnicode_EncodeUTF8(keystart,
8524 keylen,
8525 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008526#else
8527 key = PyUnicode_FromUnicode(keystart, keylen);
8528#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 if (key == NULL)
8530 goto onError;
8531 if (args_owned) {
8532 Py_DECREF(args);
8533 args_owned = 0;
8534 }
8535 args = PyObject_GetItem(dict, key);
8536 Py_DECREF(key);
8537 if (args == NULL) {
8538 goto onError;
8539 }
8540 args_owned = 1;
8541 arglen = -1;
8542 argidx = -2;
8543 }
8544 while (--fmtcnt >= 0) {
8545 switch (c = *fmt++) {
8546 case '-': flags |= F_LJUST; continue;
8547 case '+': flags |= F_SIGN; continue;
8548 case ' ': flags |= F_BLANK; continue;
8549 case '#': flags |= F_ALT; continue;
8550 case '0': flags |= F_ZERO; continue;
8551 }
8552 break;
8553 }
8554 if (c == '*') {
8555 v = getnextarg(args, arglen, &argidx);
8556 if (v == NULL)
8557 goto onError;
8558 if (!PyInt_Check(v)) {
8559 PyErr_SetString(PyExc_TypeError,
8560 "* wants int");
8561 goto onError;
8562 }
8563 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008564 if (width == -1 && PyErr_Occurred())
8565 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 if (width < 0) {
8567 flags |= F_LJUST;
8568 width = -width;
8569 }
8570 if (--fmtcnt >= 0)
8571 c = *fmt++;
8572 }
8573 else if (c >= '0' && c <= '9') {
8574 width = c - '0';
8575 while (--fmtcnt >= 0) {
8576 c = *fmt++;
8577 if (c < '0' || c > '9')
8578 break;
8579 if ((width*10) / 10 != width) {
8580 PyErr_SetString(PyExc_ValueError,
8581 "width too big");
8582 goto onError;
8583 }
8584 width = width*10 + (c - '0');
8585 }
8586 }
8587 if (c == '.') {
8588 prec = 0;
8589 if (--fmtcnt >= 0)
8590 c = *fmt++;
8591 if (c == '*') {
8592 v = getnextarg(args, arglen, &argidx);
8593 if (v == NULL)
8594 goto onError;
8595 if (!PyInt_Check(v)) {
8596 PyErr_SetString(PyExc_TypeError,
8597 "* wants int");
8598 goto onError;
8599 }
8600 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008601 if (prec == -1 && PyErr_Occurred())
8602 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 if (prec < 0)
8604 prec = 0;
8605 if (--fmtcnt >= 0)
8606 c = *fmt++;
8607 }
8608 else if (c >= '0' && c <= '9') {
8609 prec = c - '0';
8610 while (--fmtcnt >= 0) {
8611 c = Py_CHARMASK(*fmt++);
8612 if (c < '0' || c > '9')
8613 break;
8614 if ((prec*10) / 10 != prec) {
8615 PyErr_SetString(PyExc_ValueError,
8616 "prec too big");
8617 goto onError;
8618 }
8619 prec = prec*10 + (c - '0');
8620 }
8621 }
8622 } /* prec */
8623 if (fmtcnt >= 0) {
8624 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 if (--fmtcnt >= 0)
8626 c = *fmt++;
8627 }
8628 }
8629 if (fmtcnt < 0) {
8630 PyErr_SetString(PyExc_ValueError,
8631 "incomplete format");
8632 goto onError;
8633 }
8634 if (c != '%') {
8635 v = getnextarg(args, arglen, &argidx);
8636 if (v == NULL)
8637 goto onError;
8638 }
8639 sign = 0;
8640 fill = ' ';
8641 switch (c) {
8642
8643 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008644 pbuf = formatbuf;
8645 /* presume that buffer length is at least 1 */
8646 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647 len = 1;
8648 break;
8649
8650 case 's':
8651 case 'r':
8652 if (PyUnicode_Check(v) && c == 's') {
8653 temp = v;
8654 Py_INCREF(temp);
8655 }
8656 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008658 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 else
8660 temp = PyObject_Repr(v);
8661 if (temp == NULL)
8662 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008663 if (PyUnicode_Check(temp))
8664 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008665 else {
8666 Py_DECREF(temp);
8667 PyErr_SetString(PyExc_TypeError,
8668 "%s argument has non-string str()");
8669 goto onError;
8670 }
8671 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008672 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 len = PyUnicode_GET_SIZE(temp);
8674 if (prec >= 0 && len > prec)
8675 len = prec;
8676 break;
8677
8678 case 'i':
8679 case 'd':
8680 case 'u':
8681 case 'o':
8682 case 'x':
8683 case 'X':
8684 if (c == 'i')
8685 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008686 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008687 temp = formatlong(v, flags, prec, c);
8688 if (!temp)
8689 goto onError;
8690 pbuf = PyUnicode_AS_UNICODE(temp);
8691 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008692 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008694 else {
8695 pbuf = formatbuf;
8696 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8697 flags, prec, c, v);
8698 if (len < 0)
8699 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008700 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008701 }
8702 if (flags & F_ZERO)
8703 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 break;
8705
8706 case 'e':
8707 case 'E':
8708 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008709 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710 case 'g':
8711 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008712 if (c == 'F')
8713 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008714 pbuf = formatbuf;
8715 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8716 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 if (len < 0)
8718 goto onError;
8719 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008720 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 fill = '0';
8722 break;
8723
8724 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008725 pbuf = formatbuf;
8726 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 if (len < 0)
8728 goto onError;
8729 break;
8730
8731 default:
8732 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008733 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008734 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008735 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008736 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008737 (Py_ssize_t)(fmt - 1 -
8738 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 goto onError;
8740 }
8741 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008742 if (*pbuf == '-' || *pbuf == '+') {
8743 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 len--;
8745 }
8746 else if (flags & F_SIGN)
8747 sign = '+';
8748 else if (flags & F_BLANK)
8749 sign = ' ';
8750 else
8751 sign = 0;
8752 }
8753 if (width < len)
8754 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008755 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 reslen -= rescnt;
8757 rescnt = width + fmtcnt + 100;
8758 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008759 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008760 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008761 PyErr_NoMemory();
8762 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008763 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008764 if (_PyUnicode_Resize(&result, reslen) < 0) {
8765 Py_XDECREF(temp);
8766 goto onError;
8767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 res = PyUnicode_AS_UNICODE(result)
8769 + reslen - rescnt;
8770 }
8771 if (sign) {
8772 if (fill != ' ')
8773 *res++ = sign;
8774 rescnt--;
8775 if (width > len)
8776 width--;
8777 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008778 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008779 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008780 assert(pbuf[1] == c);
8781 if (fill != ' ') {
8782 *res++ = *pbuf++;
8783 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008784 }
Tim Petersfff53252001-04-12 18:38:48 +00008785 rescnt -= 2;
8786 width -= 2;
8787 if (width < 0)
8788 width = 0;
8789 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 if (width > len && !(flags & F_LJUST)) {
8792 do {
8793 --rescnt;
8794 *res++ = fill;
8795 } while (--width > len);
8796 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008797 if (fill == ' ') {
8798 if (sign)
8799 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008800 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008801 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008802 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008803 *res++ = *pbuf++;
8804 *res++ = *pbuf++;
8805 }
8806 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008807 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808 res += len;
8809 rescnt -= len;
8810 while (--width >= len) {
8811 --rescnt;
8812 *res++ = ' ';
8813 }
8814 if (dict && (argidx < arglen) && c != '%') {
8815 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008816 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008817 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818 goto onError;
8819 }
8820 Py_XDECREF(temp);
8821 } /* '%' */
8822 } /* until end */
8823 if (argidx < arglen && !dict) {
8824 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008825 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 goto onError;
8827 }
8828
Thomas Woutersa96affe2006-03-12 00:29:36 +00008829 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8830 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 if (args_owned) {
8832 Py_DECREF(args);
8833 }
8834 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 return (PyObject *)result;
8836
8837 onError:
8838 Py_XDECREF(result);
8839 Py_DECREF(uformat);
8840 if (args_owned) {
8841 Py_DECREF(args);
8842 }
8843 return NULL;
8844}
8845
Jeremy Hylton938ace62002-07-17 16:30:39 +00008846static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008847unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8848
Tim Peters6d6c1a32001-08-02 04:15:00 +00008849static PyObject *
8850unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8851{
8852 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008853 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008854 char *encoding = NULL;
8855 char *errors = NULL;
8856
Guido van Rossume023fe02001-08-30 03:12:59 +00008857 if (type != &PyUnicode_Type)
8858 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008859 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8860 kwlist, &x, &encoding, &errors))
8861 return NULL;
8862 if (x == NULL)
8863 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008864 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00008865 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008866 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008867 return PyUnicode_FromEncodedObject(x, encoding, errors);
8868}
8869
Guido van Rossume023fe02001-08-30 03:12:59 +00008870static PyObject *
8871unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8872{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008873 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008874 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008875
8876 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8877 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8878 if (tmp == NULL)
8879 return NULL;
8880 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008881 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008882 if (pnew == NULL) {
8883 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008884 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008885 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008886 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8887 if (pnew->str == NULL) {
8888 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008889 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008890 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008891 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008892 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008893 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8894 pnew->length = n;
8895 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008896 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008897 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008898}
8899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008900PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008901"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008902\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008903Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008904encoding defaults to the current default string encoding.\n\
8905errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008906
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008907static PyObject *unicode_iter(PyObject *seq);
8908
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008910 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008911 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 sizeof(PyUnicodeObject), /* tp_size */
8913 0, /* tp_itemsize */
8914 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008915 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008917 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008919 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008920 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008921 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008923 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 (hashfunc) unicode_hash, /* tp_hash*/
8925 0, /* tp_call*/
8926 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008927 PyObject_GenericGetAttr, /* tp_getattro */
8928 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008929 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008930 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8931 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008932 unicode_doc, /* tp_doc */
8933 0, /* tp_traverse */
8934 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008935 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008936 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008937 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008938 0, /* tp_iternext */
8939 unicode_methods, /* tp_methods */
8940 0, /* tp_members */
8941 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008942 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008943 0, /* tp_dict */
8944 0, /* tp_descr_get */
8945 0, /* tp_descr_set */
8946 0, /* tp_dictoffset */
8947 0, /* tp_init */
8948 0, /* tp_alloc */
8949 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008950 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951};
8952
8953/* Initialize the Unicode implementation */
8954
Thomas Wouters78890102000-07-22 19:25:51 +00008955void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008957 int i;
8958
Thomas Wouters477c8d52006-05-27 19:21:47 +00008959 /* XXX - move this array to unicodectype.c ? */
8960 Py_UNICODE linebreak[] = {
8961 0x000A, /* LINE FEED */
8962 0x000D, /* CARRIAGE RETURN */
8963 0x001C, /* FILE SEPARATOR */
8964 0x001D, /* GROUP SEPARATOR */
8965 0x001E, /* RECORD SEPARATOR */
8966 0x0085, /* NEXT LINE */
8967 0x2028, /* LINE SEPARATOR */
8968 0x2029, /* PARAGRAPH SEPARATOR */
8969 };
8970
Fred Drakee4315f52000-05-09 19:53:39 +00008971 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008972 unicode_freelist = NULL;
8973 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008975 if (!unicode_empty)
8976 return;
8977
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008978 for (i = 0; i < 256; i++)
8979 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008980 if (PyType_Ready(&PyUnicode_Type) < 0)
8981 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008982
8983 /* initialize the linebreak bloom filter */
8984 bloom_linebreak = make_bloom_mask(
8985 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8986 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008987
8988 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989}
8990
8991/* Finalize the Unicode implementation */
8992
8993void
Thomas Wouters78890102000-07-22 19:25:51 +00008994_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008996 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008997 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008999 Py_XDECREF(unicode_empty);
9000 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009002 for (i = 0; i < 256; i++) {
9003 if (unicode_latin1[i]) {
9004 Py_DECREF(unicode_latin1[i]);
9005 unicode_latin1[i] = NULL;
9006 }
9007 }
9008
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009009 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 PyUnicodeObject *v = u;
9011 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009012 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009013 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009014 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009015 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009017 unicode_freelist = NULL;
9018 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009020
Walter Dörwald16807132007-05-25 13:52:07 +00009021void
9022PyUnicode_InternInPlace(PyObject **p)
9023{
9024 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9025 PyObject *t;
9026 if (s == NULL || !PyUnicode_Check(s))
9027 Py_FatalError(
9028 "PyUnicode_InternInPlace: unicode strings only please!");
9029 /* If it's a subclass, we don't really know what putting
9030 it in the interned dict might do. */
9031 if (!PyUnicode_CheckExact(s))
9032 return;
9033 if (PyUnicode_CHECK_INTERNED(s))
9034 return;
9035 if (interned == NULL) {
9036 interned = PyDict_New();
9037 if (interned == NULL) {
9038 PyErr_Clear(); /* Don't leave an exception */
9039 return;
9040 }
9041 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009042 /* It might be that the GetItem call fails even
9043 though the key is present in the dictionary,
9044 namely when this happens during a stack overflow. */
9045 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009046 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009047 Py_END_ALLOW_RECURSION
9048
Walter Dörwald16807132007-05-25 13:52:07 +00009049 if (t) {
9050 Py_INCREF(t);
9051 Py_DECREF(*p);
9052 *p = t;
9053 return;
9054 }
9055
Martin v. Löwis5b222132007-06-10 09:51:05 +00009056 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009057 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9058 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009059 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009060 return;
9061 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009062 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009063 /* The two references in interned are not counted by refcnt.
9064 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009065 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009066 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9067}
9068
9069void
9070PyUnicode_InternImmortal(PyObject **p)
9071{
9072 PyUnicode_InternInPlace(p);
9073 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9074 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9075 Py_INCREF(*p);
9076 }
9077}
9078
9079PyObject *
9080PyUnicode_InternFromString(const char *cp)
9081{
9082 PyObject *s = PyUnicode_FromString(cp);
9083 if (s == NULL)
9084 return NULL;
9085 PyUnicode_InternInPlace(&s);
9086 return s;
9087}
9088
9089void _Py_ReleaseInternedUnicodeStrings(void)
9090{
9091 PyObject *keys;
9092 PyUnicodeObject *s;
9093 Py_ssize_t i, n;
9094 Py_ssize_t immortal_size = 0, mortal_size = 0;
9095
9096 if (interned == NULL || !PyDict_Check(interned))
9097 return;
9098 keys = PyDict_Keys(interned);
9099 if (keys == NULL || !PyList_Check(keys)) {
9100 PyErr_Clear();
9101 return;
9102 }
9103
9104 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9105 detector, interned unicode strings are not forcibly deallocated;
9106 rather, we give them their stolen references back, and then clear
9107 and DECREF the interned dict. */
9108
9109 n = PyList_GET_SIZE(keys);
9110 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9111 n);
9112 for (i = 0; i < n; i++) {
9113 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9114 switch (s->state) {
9115 case SSTATE_NOT_INTERNED:
9116 /* XXX Shouldn't happen */
9117 break;
9118 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009119 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009120 immortal_size += s->length;
9121 break;
9122 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009123 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009124 mortal_size += s->length;
9125 break;
9126 default:
9127 Py_FatalError("Inconsistent interned string state.");
9128 }
9129 s->state = SSTATE_NOT_INTERNED;
9130 }
9131 fprintf(stderr, "total size of all interned strings: "
9132 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9133 "mortal/immortal\n", mortal_size, immortal_size);
9134 Py_DECREF(keys);
9135 PyDict_Clear(interned);
9136 Py_DECREF(interned);
9137 interned = NULL;
9138}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009139
9140
9141/********************* Unicode Iterator **************************/
9142
9143typedef struct {
9144 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009145 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009146 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9147} unicodeiterobject;
9148
9149static void
9150unicodeiter_dealloc(unicodeiterobject *it)
9151{
9152 _PyObject_GC_UNTRACK(it);
9153 Py_XDECREF(it->it_seq);
9154 PyObject_GC_Del(it);
9155}
9156
9157static int
9158unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9159{
9160 Py_VISIT(it->it_seq);
9161 return 0;
9162}
9163
9164static PyObject *
9165unicodeiter_next(unicodeiterobject *it)
9166{
9167 PyUnicodeObject *seq;
9168 PyObject *item;
9169
9170 assert(it != NULL);
9171 seq = it->it_seq;
9172 if (seq == NULL)
9173 return NULL;
9174 assert(PyUnicode_Check(seq));
9175
9176 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009177 item = PyUnicode_FromUnicode(
9178 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009179 if (item != NULL)
9180 ++it->it_index;
9181 return item;
9182 }
9183
9184 Py_DECREF(seq);
9185 it->it_seq = NULL;
9186 return NULL;
9187}
9188
9189static PyObject *
9190unicodeiter_len(unicodeiterobject *it)
9191{
9192 Py_ssize_t len = 0;
9193 if (it->it_seq)
9194 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9195 return PyInt_FromSsize_t(len);
9196}
9197
9198PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9199
9200static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009201 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9202 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009203 {NULL, NULL} /* sentinel */
9204};
9205
9206PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009207 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum254348e2007-11-21 19:29:53 +00009208 "unicode_iterator", /* tp_name */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009209 sizeof(unicodeiterobject), /* tp_basicsize */
9210 0, /* tp_itemsize */
9211 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009212 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009213 0, /* tp_print */
9214 0, /* tp_getattr */
9215 0, /* tp_setattr */
9216 0, /* tp_compare */
9217 0, /* tp_repr */
9218 0, /* tp_as_number */
9219 0, /* tp_as_sequence */
9220 0, /* tp_as_mapping */
9221 0, /* tp_hash */
9222 0, /* tp_call */
9223 0, /* tp_str */
9224 PyObject_GenericGetAttr, /* tp_getattro */
9225 0, /* tp_setattro */
9226 0, /* tp_as_buffer */
9227 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9228 0, /* tp_doc */
9229 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9230 0, /* tp_clear */
9231 0, /* tp_richcompare */
9232 0, /* tp_weaklistoffset */
9233 PyObject_SelfIter, /* tp_iter */
9234 (iternextfunc)unicodeiter_next, /* tp_iternext */
9235 unicodeiter_methods, /* tp_methods */
9236 0,
9237};
9238
9239static PyObject *
9240unicode_iter(PyObject *seq)
9241{
9242 unicodeiterobject *it;
9243
9244 if (!PyUnicode_Check(seq)) {
9245 PyErr_BadInternalCall();
9246 return NULL;
9247 }
9248 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9249 if (it == NULL)
9250 return NULL;
9251 it->it_index = 0;
9252 Py_INCREF(seq);
9253 it->it_seq = (PyUnicodeObject *)seq;
9254 _PyObject_GC_TRACK(it);
9255 return (PyObject *)it;
9256}
9257
Martin v. Löwis5b222132007-06-10 09:51:05 +00009258size_t
9259Py_UNICODE_strlen(const Py_UNICODE *u)
9260{
9261 int res = 0;
9262 while(*u++)
9263 res++;
9264 return res;
9265}
9266
9267Py_UNICODE*
9268Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9269{
9270 Py_UNICODE *u = s1;
9271 while ((*u++ = *s2++));
9272 return s1;
9273}
9274
9275Py_UNICODE*
9276Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9277{
9278 Py_UNICODE *u = s1;
9279 while ((*u++ = *s2++))
9280 if (n-- == 0)
9281 break;
9282 return s1;
9283}
9284
9285int
9286Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9287{
9288 while (*s1 && *s2 && *s1 == *s2)
9289 s1++, s2++;
9290 if (*s1 && *s2)
9291 return (*s1 < *s2) ? -1 : +1;
9292 if (*s1)
9293 return 1;
9294 if (*s2)
9295 return -1;
9296 return 0;
9297}
9298
9299Py_UNICODE*
9300Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9301{
9302 const Py_UNICODE *p;
9303 for (p = s; *p; p++)
9304 if (*p == c)
9305 return (Py_UNICODE*)p;
9306 return NULL;
9307}
9308
9309
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009310#ifdef __cplusplus
9311}
9312#endif
9313
9314
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009315/*
9316Local variables:
9317c-basic-offset: 4
9318indent-tabs-mode: nil
9319End:
9320*/