blob: 426dc079ca3df33417ea95a40de59397b74b41d2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Eric Smith8c663262007-08-25 02:26:07 +000049#include "formatter_unicode.h"
50
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000051#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000052#include <windows.h>
53#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000054
Guido van Rossumd57fd912000-03-10 22:53:23 +000055/* Limit for the Unicode object free list */
56
57#define MAX_UNICODE_FREELIST_SIZE 1024
58
59/* Limit for the Unicode object free list stay alive optimization.
60
61 The implementation will keep allocated Unicode memory intact for
62 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000063 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Barry Warsaw51ac5802000-03-20 16:36:48 +000065 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000067 malloc()-overhead) bytes of unused garbage.
68
69 Setting the limit to 0 effectively turns the feature off.
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071 Note: This is an experimental feature ! If you get core dumps when
72 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000073
74*/
75
Guido van Rossumfd4b9572000-04-10 13:51:10 +000076#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000077
78/* Endianness switches; defaults to little endian */
79
80#ifdef WORDS_BIGENDIAN
81# define BYTEORDER_IS_BIG_ENDIAN
82#else
83# define BYTEORDER_IS_LITTLE_ENDIAN
84#endif
85
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086/* --- Globals ------------------------------------------------------------
87
88 The globals are initialized by the _PyUnicode_Init() API and should
89 not be used before calling that API.
90
91*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093
94#ifdef __cplusplus
95extern "C" {
96#endif
97
Walter Dörwald16807132007-05-25 13:52:07 +000098/* This dictionary holds all interned unicode strings. Note that references
99 to strings in this dictionary are *not* counted in the string's ob_refcnt.
100 When the interned string reaches a refcnt of 0 the string deallocation
101 function will delete the reference from this dictionary.
102
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000105*/
106static PyObject *interned;
107
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000109static PyUnicodeObject *unicode_freelist;
110static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000120 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000121 PyUnicode_GetDefaultEncoding() API to access this global.
122
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000123 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000124 hard coded default!
125*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000126static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000127
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000129PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000130{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000131#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000132 return 0x10FFFF;
133#else
134 /* This is actually an illegal character, so it should
135 not be passed to unichr. */
136 return 0xFFFF;
137#endif
138}
139
Thomas Wouters477c8d52006-05-27 19:21:47 +0000140/* --- Bloom Filters ----------------------------------------------------- */
141
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the five
   least-significant bits of each Unicode character as the bit index. */
145
146/* the linebreak mask is set up by Unicode_Init below */
147
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask consulted by BLOOM_LINEBREAK() before the full line break test. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low five bits of `ch` is set in
   `mask`.  A zero result means `ch` cannot be in the set the mask was
   built from; a non-zero result only means it might be.

   The shifted constant is an unsigned long so that selecting bit 31
   never shifts into the sign bit of an int (undefined behaviour) and so
   that its type matches BLOOM_MASK.  `mask` is parenthesized so that an
   expression argument is combined as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True if `ch` is a line break character.  The cheap mask test runs
   first; Py_UNICODE_ISLINEBREAK is only evaluated when the mask bit
   for `ch` is set. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
156
157Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
158{
159 /* calculate simple bloom-style bitmask for a given unicode string */
160
161 long mask;
162 Py_ssize_t i;
163
164 mask = 0;
165 for (i = 0; i < len; i++)
166 mask |= (1 << (ptr[i] & 0x1F));
167
168 return mask;
169}
170
171Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
172{
173 Py_ssize_t i;
174
175 for (i = 0; i < setlen; i++)
176 if (set[i] == chr)
177 return 1;
178
179 return 0;
180}
181
/* True if `chr` is one of the `setlen` characters in `set`.  The bloom
   mask is checked first so that unicode_member() is only called when
   the mask says `chr` might be present.  The whole expansion is
   parenthesized so the macro behaves as a single operand when combined
   with `!`, `||` or other operators at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
184
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185/* --- Unicode Object ----------------------------------------------------- */
186
187static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000189 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190{
191 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000192
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000193 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 /* Resizing shared object (unicode_empty or single character
198 objects) in-place is not allowed. Use PyUnicode_Resize()
199 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 if (unicode == unicode_empty ||
202 (unicode->length == 1 &&
203 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 return -1;
208 }
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210 /* We allocate one more byte to make sure the string is Ux0000 terminated.
211 The overallocation is also used by fastsearch, which assumes that it's
212 safe to look at str[length] (without making any assumptions about what
213 it contains). */
214
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 oldstr = unicode->str;
216 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
217 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 PyErr_NoMemory();
220 return -1;
221 }
222 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000223 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000225 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000227 if (unicode->defenc) {
228 Py_DECREF(unicode->defenc);
229 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 }
231 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000232
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 return 0;
234}
235
/* We allocate one more character (Py_UNICODE element) to make sure the
   string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239
240 XXX This allocator could further be enhanced by assuring that the
241 free list never reduces its size below 1.
242
243*/
244
245static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 register PyUnicodeObject *unicode;
249
Thomas Wouters477c8d52006-05-27 19:21:47 +0000250 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (length == 0 && unicode_empty != NULL) {
252 Py_INCREF(unicode_empty);
253 return unicode_empty;
254 }
255
256 /* Unicode freelist & memory allocation */
257 if (unicode_freelist) {
258 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000259 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Keep-Alive optimization: we only upsize the buffer,
263 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000264 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000265 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000266 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 }
269 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000270 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000272 }
273 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 }
275 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000276 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (unicode == NULL)
278 return NULL;
279 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
280 }
281
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000282 if (!unicode->str) {
283 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000284 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000285 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000287 * the caller fails before initializing str -- unicode_resize()
288 * reads str[0], and the Keep-Alive optimization can keep memory
289 * allocated for str alive across a call to unicode_dealloc(unicode).
290 * We don't want unicode_resize to read uninitialized memory in
291 * that case.
292 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000293 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000297 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000298 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000300
301 onError:
302 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000303 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305}
306
307static
Guido van Rossum9475a232001-10-05 20:51:39 +0000308void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
Walter Dörwald16807132007-05-25 13:52:07 +0000310 switch (PyUnicode_CHECK_INTERNED(unicode)) {
311 case SSTATE_NOT_INTERNED:
312 break;
313
314 case SSTATE_INTERNED_MORTAL:
315 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000316 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000317 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
318 Py_FatalError(
319 "deletion of interned unicode string failed");
320 break;
321
322 case SSTATE_INTERNED_IMMORTAL:
323 Py_FatalError("Immortal interned unicode string died.");
324
325 default:
326 Py_FatalError("Inconsistent interned unicode string state.");
327 }
328
Guido van Rossum604ddf82001-12-06 20:03:56 +0000329 if (PyUnicode_CheckExact(unicode) &&
330 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000331 /* Keep-Alive optimization */
332 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000333 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 unicode->str = NULL;
335 unicode->length = 0;
336 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000337 if (unicode->defenc) {
338 Py_DECREF(unicode->defenc);
339 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000340 }
341 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 *(PyUnicodeObject **)unicode = unicode_freelist;
343 unicode_freelist = unicode;
344 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000347 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000348 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000349 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 }
351}
352
Martin v. Löwis18e16552006-02-15 17:27:45 +0000353int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000354{
355 register PyUnicodeObject *v;
356
357 /* Argument checks */
358 if (unicode == NULL) {
359 PyErr_BadInternalCall();
360 return -1;
361 }
362 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000363 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 PyErr_BadInternalCall();
365 return -1;
366 }
367
368 /* Resizing unicode_empty and single character objects is not
369 possible since these are being shared. We simply return a fresh
370 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000371 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 (v == unicode_empty || v->length == 1)) {
373 PyUnicodeObject *w = _PyUnicode_New(length);
374 if (w == NULL)
375 return -1;
376 Py_UNICODE_COPY(w->str, v->str,
377 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000378 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 *unicode = (PyObject *)w;
380 return 0;
381 }
382
383 /* Note that we don't have to modify *unicode for unshared Unicode
384 objects, since we can modify them in-place. */
385 return unicode_resize(v, length);
386}
387
388/* Internal API for use in unicodeobject.c only ! */
389#define _PyUnicode_Resize(unicodevar, length) \
390 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000393 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 PyUnicodeObject *unicode;
396
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000397 /* If the Unicode data is known at construction time, we can apply
398 some optimizations which share commonly used objects. */
399 if (u != NULL) {
400
401 /* Optimization for empty strings */
402 if (size == 0 && unicode_empty != NULL) {
403 Py_INCREF(unicode_empty);
404 return (PyObject *)unicode_empty;
405 }
406
407 /* Single character Unicode objects in the Latin-1 range are
408 shared when using this constructor */
409 if (size == 1 && *u < 256) {
410 unicode = unicode_latin1[*u];
411 if (!unicode) {
412 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000413 if (!unicode)
414 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000415 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000416 unicode_latin1[*u] = unicode;
417 }
418 Py_INCREF(unicode);
419 return (PyObject *)unicode;
420 }
421 }
Tim Petersced69f82003-09-16 20:30:58 +0000422
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 unicode = _PyUnicode_New(size);
424 if (!unicode)
425 return NULL;
426
427 /* Copy the Unicode data into the new object */
428 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430
431 return (PyObject *)unicode;
432}
433
Walter Dörwaldd2034312007-05-18 16:29:38 +0000434PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000435{
436 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000437 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000438 some optimizations which share commonly used objects.
439 Also, this means the input must be UTF-8, so fall back to the
440 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000441 if (u != NULL) {
442
443 /* Optimization for empty strings */
444 if (size == 0 && unicode_empty != NULL) {
445 Py_INCREF(unicode_empty);
446 return (PyObject *)unicode_empty;
447 }
448
Martin v. Löwis9c121062007-08-05 20:26:11 +0000449 /* Single characters are shared when using this constructor.
450 Restrict to ASCII, since the input must be UTF-8. */
451 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000452 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000453 if (!unicode) {
454 unicode = _PyUnicode_New(1);
455 if (!unicode)
456 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000457 unicode->str[0] = Py_CHARMASK(*u);
458 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000459 }
460 Py_INCREF(unicode);
461 return (PyObject *)unicode;
462 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000463
464 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
Walter Dörwald55507312007-05-18 13:12:10 +0000467 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000468 if (!unicode)
469 return NULL;
470
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000471 return (PyObject *)unicode;
472}
473
Walter Dörwaldd2034312007-05-18 16:29:38 +0000474PyObject *PyUnicode_FromString(const char *u)
475{
476 size_t size = strlen(u);
477 if (size > PY_SSIZE_T_MAX) {
478 PyErr_SetString(PyExc_OverflowError, "input too long");
479 return NULL;
480 }
481
482 return PyUnicode_FromStringAndSize(u, size);
483}
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485#ifdef HAVE_WCHAR_H
486
487PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000488 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489{
490 PyUnicodeObject *unicode;
491
492 if (w == NULL) {
493 PyErr_BadInternalCall();
494 return NULL;
495 }
496
497 unicode = _PyUnicode_New(size);
498 if (!unicode)
499 return NULL;
500
501 /* Copy the wchar_t data into the new object */
502#ifdef HAVE_USABLE_WCHAR_T
503 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000504#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 {
506 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000507 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000509 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 *u++ = *w++;
511 }
512#endif
513
514 return (PyObject *)unicode;
515}
516
Walter Dörwald346737f2007-05-31 10:44:43 +0000517static void
518makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
519{
520 *fmt++ = '%';
521 if (width) {
522 if (zeropad)
523 *fmt++ = '0';
524 fmt += sprintf(fmt, "%d", width);
525 }
526 if (precision)
527 fmt += sprintf(fmt, ".%d", precision);
528 if (longflag)
529 *fmt++ = 'l';
530 else if (size_tflag) {
531 char *f = PY_FORMAT_SIZE_T;
532 while (*f)
533 *fmt++ = *f++;
534 }
535 *fmt++ = c;
536 *fmt = '\0';
537}
538
/* Copy the NUL-terminated char string `string` to the output cursor `s`,
   one element per char, advancing `s`; the NUL itself is not copied.
   Relies on `copy` and `s` being declared in the enclosing scope. */
#define appendstring(string) { copy = (string); while (*copy) *s++ = *copy++; }
540
541PyObject *
542PyUnicode_FromFormatV(const char *format, va_list vargs)
543{
544 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000545 Py_ssize_t callcount = 0;
546 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000547 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 int width = 0;
550 int precision = 0;
551 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000552 const char* f;
553 Py_UNICODE *s;
554 PyObject *string;
555 /* used by sprintf */
556 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000557 /* use abuffer instead of buffer, if we need more space
558 * (which can happen if there's a format specifier with width). */
559 char *abuffer = NULL;
560 char *realbuffer;
561 Py_ssize_t abuffersize = 0;
562 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000563 const char *copy;
564
565#ifdef VA_LIST_IS_ARRAY
566 Py_MEMCPY(count, vargs, sizeof(va_list));
567#else
568#ifdef __va_copy
569 __va_copy(count, vargs);
570#else
571 count = vargs;
572#endif
573#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 1: count the number of %S/%R format specifications
Thomas Heller519a0422007-11-15 20:48:54 +0000575 * (we call PyObject_Str()/PyObject_Repr() for these objects
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000576 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000577 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000578 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 ++callcount;
580 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000581 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000582 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000583 if (callcount) {
584 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
585 if (!callresults) {
586 PyErr_NoMemory();
587 return NULL;
588 }
589 callresult = callresults;
590 }
591 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000592 for (f = format; *f; f++) {
593 if (*f == '%') {
594 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000595 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000596 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000597 width = (width*10) + *f++ - '0';
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000598 while (*++f && *f != '%' && !ISALPHA(*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000599 ;
600
601 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
602 * they don't affect the amount of space we reserve.
603 */
604 if ((*f == 'l' || *f == 'z') &&
605 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000606 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000607
608 switch (*f) {
609 case 'c':
610 (void)va_arg(count, int);
611 /* fall through... */
612 case '%':
613 n++;
614 break;
615 case 'd': case 'u': case 'i': case 'x':
616 (void) va_arg(count, int);
617 /* 20 bytes is enough to hold a 64-bit
618 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000619 This isn't enough for octal.
620 If a width is specified we need more
621 (which we allocate later). */
622 if (width < 20)
623 width = 20;
624 n += width;
625 if (abuffersize < width)
626 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000627 break;
628 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000629 {
630 /* UTF-8 */
631 unsigned char*s;
632 s = va_arg(count, unsigned char*);
633 while (*s) {
634 if (*s < 128) {
635 n++; s++;
636 } else if (*s < 0xc0) {
637 /* invalid UTF-8 */
638 n++; s++;
639 } else if (*s < 0xc0) {
640 n++;
641 s++; if(!*s)break;
642 s++;
643 } else if (*s < 0xe0) {
644 n++;
645 s++; if(!*s)break;
646 s++; if(!*s)break;
647 s++;
648 } else {
649 #ifdef Py_UNICODE_WIDE
650 n++;
651 #else
652 n+=2;
653 #endif
654 s++; if(!*s)break;
655 s++; if(!*s)break;
656 s++; if(!*s)break;
657 s++;
658 }
659 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000660 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000661 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000662 case 'U':
663 {
664 PyObject *obj = va_arg(count, PyObject *);
665 assert(obj && PyUnicode_Check(obj));
666 n += PyUnicode_GET_SIZE(obj);
667 break;
668 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000669 case 'V':
670 {
671 PyObject *obj = va_arg(count, PyObject *);
672 const char *str = va_arg(count, const char *);
673 assert(obj || str);
674 assert(!obj || PyUnicode_Check(obj));
675 if (obj)
676 n += PyUnicode_GET_SIZE(obj);
677 else
678 n += strlen(str);
679 break;
680 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000681 case 'S':
682 {
683 PyObject *obj = va_arg(count, PyObject *);
684 PyObject *str;
685 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000686 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000687 if (!str)
688 goto fail;
689 n += PyUnicode_GET_SIZE(str);
690 /* Remember the str and switch to the next slot */
691 *callresult++ = str;
692 break;
693 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000694 case 'R':
695 {
696 PyObject *obj = va_arg(count, PyObject *);
697 PyObject *repr;
698 assert(obj);
699 repr = PyObject_Repr(obj);
700 if (!repr)
701 goto fail;
702 n += PyUnicode_GET_SIZE(repr);
703 /* Remember the repr and switch to the next slot */
704 *callresult++ = repr;
705 break;
706 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000707 case 'p':
708 (void) va_arg(count, int);
709 /* maximum 64-bit pointer representation:
710 * 0xffffffffffffffff
711 * so 19 characters is enough.
712 * XXX I count 18 -- what's the extra for?
713 */
714 n += 19;
715 break;
716 default:
717 /* if we stumble upon an unknown
718 formatting code, copy the rest of
719 the format string to the output
720 string. (we cannot just skip the
721 code, since there's no way to know
722 what's in the argument list) */
723 n += strlen(p);
724 goto expand;
725 }
726 } else
727 n++;
728 }
729 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 if (abuffersize > 20) {
731 abuffer = PyMem_Malloc(abuffersize);
732 if (!abuffer) {
733 PyErr_NoMemory();
734 goto fail;
735 }
736 realbuffer = abuffer;
737 }
738 else
739 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000740 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000741 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000742 we don't have to resize the string.
743 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 string = PyUnicode_FromUnicode(NULL, n);
745 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747
748 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000749 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750
751 for (f = format; *f; f++) {
752 if (*f == '%') {
753 const char* p = f++;
754 int longflag = 0;
755 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 zeropad = (*f == '0');
757 /* parse the width.precision part */
758 width = 0;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000759 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 width = (width*10) + *f++ - '0';
761 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 if (*f == '.') {
763 f++;
Guido van Rossumdaa251c2007-10-25 23:47:33 +0000764 while (ISDIGIT(*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000766 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 /* handle the long flag, but only for %ld and %lu.
768 others can be added when necessary. */
769 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
770 longflag = 1;
771 ++f;
772 }
773 /* handle the size_t flag. */
774 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
775 size_tflag = 1;
776 ++f;
777 }
778
779 switch (*f) {
780 case 'c':
781 *s++ = va_arg(vargs, int);
782 break;
783 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000784 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000786 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000787 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000788 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000789 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000790 sprintf(realbuffer, fmt, va_arg(vargs, int));
791 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000792 break;
793 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000794 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000795 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000796 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000798 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000799 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000800 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
801 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000802 break;
803 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000804 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
805 sprintf(realbuffer, fmt, va_arg(vargs, int));
806 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000807 break;
808 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000809 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
810 sprintf(realbuffer, fmt, va_arg(vargs, int));
811 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000812 break;
813 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000814 {
815 /* Parameter must be UTF-8 encoded.
816 In case of encoding errors, use
817 the replacement character. */
818 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000819 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000820 u = PyUnicode_DecodeUTF8(p, strlen(p),
821 "replace");
822 if (!u)
823 goto fail;
824 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
825 PyUnicode_GET_SIZE(u));
826 s += PyUnicode_GET_SIZE(u);
827 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000828 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000829 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000830 case 'U':
831 {
832 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000833 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
834 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
835 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000836 break;
837 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000838 case 'V':
839 {
840 PyObject *obj = va_arg(vargs, PyObject *);
841 const char *str = va_arg(vargs, const char *);
842 if (obj) {
843 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
844 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
845 s += size;
846 } else {
847 appendstring(str);
848 }
849 break;
850 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000851 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 case 'R':
853 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000854 Py_UNICODE *ucopy;
855 Py_ssize_t usize;
856 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000857 /* unused, since we already have the result */
858 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000859 ucopy = PyUnicode_AS_UNICODE(*callresult);
860 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000861 for (upos = 0; upos<usize;)
862 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000863 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000864 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000865 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000866 ++callresult;
867 break;
868 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000869 case 'p':
870 sprintf(buffer, "%p", va_arg(vargs, void*));
871 /* %p is ill-defined: ensure leading 0x. */
872 if (buffer[1] == 'X')
873 buffer[1] = 'x';
874 else if (buffer[1] != 'x') {
875 memmove(buffer+2, buffer, strlen(buffer)+1);
876 buffer[0] = '0';
877 buffer[1] = 'x';
878 }
879 appendstring(buffer);
880 break;
881 case '%':
882 *s++ = '%';
883 break;
884 default:
885 appendstring(p);
886 goto end;
887 }
888 } else
889 *s++ = *f;
890 }
891
892 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000893 if (callresults)
894 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 if (abuffer)
896 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000897 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
898 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000899 fail:
900 if (callresults) {
901 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000902 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000903 Py_DECREF(*callresult2);
904 ++callresult2;
905 }
906 PyMem_Free(callresults);
907 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000908 if (abuffer)
909 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000910 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000911}
912
913#undef appendstring
914
915PyObject *
916PyUnicode_FromFormat(const char *format, ...)
917{
918 PyObject* ret;
919 va_list vargs;
920
921#ifdef HAVE_STDARG_PROTOTYPES
922 va_start(vargs, format);
923#else
924 va_start(vargs);
925#endif
926 ret = PyUnicode_FromFormatV(format, vargs);
927 va_end(vargs);
928 return ret;
929}
930
Martin v. Löwis18e16552006-02-15 17:27:45 +0000931Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
932 wchar_t *w,
933 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934{
935 if (unicode == NULL) {
936 PyErr_BadInternalCall();
937 return -1;
938 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000939
940 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000942 size = PyUnicode_GET_SIZE(unicode) + 1;
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944#ifdef HAVE_USABLE_WCHAR_T
945 memcpy(w, unicode->str, size * sizeof(wchar_t));
946#else
947 {
948 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000949 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000951 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 *w++ = *u++;
953 }
954#endif
955
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000956 if (size > PyUnicode_GET_SIZE(unicode))
957 return PyUnicode_GET_SIZE(unicode);
958 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000959 return size;
960}
961
962#endif
963
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000964PyObject *PyUnicode_FromOrdinal(int ordinal)
965{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000966 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000967
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000968 if (ordinal < 0 || ordinal > 0x10ffff) {
969 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000970 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000971 return NULL;
972 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000973
974#ifndef Py_UNICODE_WIDE
975 if (ordinal > 0xffff) {
976 ordinal -= 0x10000;
977 s[0] = 0xD800 | (ordinal >> 10);
978 s[1] = 0xDC00 | (ordinal & 0x3FF);
979 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000980 }
981#endif
982
Hye-Shik Chang40574832004-04-06 07:24:51 +0000983 s[0] = (Py_UNICODE)ordinal;
984 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000985}
986
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987PyObject *PyUnicode_FromObject(register PyObject *obj)
988{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +0000990 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000991 if (PyUnicode_CheckExact(obj)) {
992 Py_INCREF(obj);
993 return obj;
994 }
995 if (PyUnicode_Check(obj)) {
996 /* For a Unicode subtype that's not a Unicode object,
997 return a true Unicode object with the same data. */
998 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
999 PyUnicode_GET_SIZE(obj));
1000 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001001 PyErr_Format(PyExc_TypeError,
1002 "Can't convert '%.100s' object to str implicitly",
1003 Py_Type(obj)->tp_name);
1004 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001005}
1006
1007PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1008 const char *encoding,
1009 const char *errors)
1010{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001011 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001012 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001014
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 if (obj == NULL) {
1016 PyErr_BadInternalCall();
1017 return NULL;
1018 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001020 if (PyUnicode_Check(obj)) {
1021 PyErr_SetString(PyExc_TypeError,
1022 "decoding Unicode is not supported");
1023 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001024 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001025
1026 /* Coerce object */
1027 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001028 s = PyString_AS_STRING(obj);
1029 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001030 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001031 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1032 /* Overwrite the error message with something more useful in
1033 case of a TypeError. */
1034 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001035 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001036 "coercing to Unicode: need string or buffer, "
1037 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001038 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001039 goto onError;
1040 }
Tim Petersced69f82003-09-16 20:30:58 +00001041
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (len == 0) {
1044 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001045 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 }
Tim Petersced69f82003-09-16 20:30:58 +00001047 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001048 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001049
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001050 return v;
1051
1052 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054}
1055
1056PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001057 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 const char *encoding,
1059 const char *errors)
1060{
1061 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001062 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001063 char lower[20]; /* Enough for any encoding name we recognize */
1064 char *l;
1065 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066
1067 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001068 encoding = PyUnicode_GetDefaultEncoding();
1069
1070 /* Convert encoding to lower case and replace '_' with '-' in order to
1071 catch e.g. UTF_8 */
1072 e = encoding;
1073 l = lower;
1074 while (*e && l < &lower[(sizeof lower) - 2]) {
1075 if (ISUPPER(*e)) {
1076 *l++ = TOLOWER(*e++);
1077 }
1078 else if (*e == '_') {
1079 *l++ = '-';
1080 e++;
1081 }
1082 else {
1083 *l++ = *e++;
1084 }
1085 }
1086 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001087
1088 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001089 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001091 else if ((strcmp(lower, "latin-1") == 0) ||
1092 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001093 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001094#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001095 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001096 return PyUnicode_DecodeMBCS(s, size, errors);
1097#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001098 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001099 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001100 else if (strcmp(lower, "utf-16") == 0)
1101 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1102 else if (strcmp(lower, "utf-32") == 0)
1103 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104
1105 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001106 buffer = NULL;
1107 if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1108 goto onError;
1109 buffer = PyMemoryView_FromMemory(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (buffer == NULL)
1111 goto onError;
1112 unicode = PyCodec_Decode(buffer, encoding, errors);
1113 if (unicode == NULL)
1114 goto onError;
1115 if (!PyUnicode_Check(unicode)) {
1116 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001117 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001118 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 Py_DECREF(unicode);
1120 goto onError;
1121 }
1122 Py_DECREF(buffer);
1123 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001124
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 onError:
1126 Py_XDECREF(buffer);
1127 return NULL;
1128}
1129
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001130PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1131 const char *encoding,
1132 const char *errors)
1133{
1134 PyObject *v;
1135
1136 if (!PyUnicode_Check(unicode)) {
1137 PyErr_BadArgument();
1138 goto onError;
1139 }
1140
1141 if (encoding == NULL)
1142 encoding = PyUnicode_GetDefaultEncoding();
1143
1144 /* Decode via the codec registry */
1145 v = PyCodec_Decode(unicode, encoding, errors);
1146 if (v == NULL)
1147 goto onError;
1148 return v;
1149
1150 onError:
1151 return NULL;
1152}
1153
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001155 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 const char *encoding,
1157 const char *errors)
1158{
1159 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 unicode = PyUnicode_FromUnicode(s, size);
1162 if (unicode == NULL)
1163 return NULL;
1164 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1165 Py_DECREF(unicode);
1166 return v;
1167}
1168
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001169PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1170 const char *encoding,
1171 const char *errors)
1172{
1173 PyObject *v;
1174
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 goto onError;
1178 }
1179
1180 if (encoding == NULL)
1181 encoding = PyUnicode_GetDefaultEncoding();
1182
1183 /* Encode via the codec registry */
1184 v = PyCodec_Encode(unicode, encoding, errors);
1185 if (v == NULL)
1186 goto onError;
1187 return v;
1188
1189 onError:
1190 return NULL;
1191}
1192
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1194 const char *encoding,
1195 const char *errors)
1196{
1197 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001198
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 if (!PyUnicode_Check(unicode)) {
1200 PyErr_BadArgument();
1201 goto onError;
1202 }
Fred Drakee4315f52000-05-09 19:53:39 +00001203
Tim Petersced69f82003-09-16 20:30:58 +00001204 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001205 encoding = PyUnicode_GetDefaultEncoding();
1206
1207 /* Shortcuts for common default encodings */
1208 if (errors == NULL) {
1209 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001210 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001211 else if (strcmp(encoding, "latin-1") == 0)
1212 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001213#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1214 else if (strcmp(encoding, "mbcs") == 0)
1215 return PyUnicode_AsMBCSString(unicode);
1216#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001217 else if (strcmp(encoding, "ascii") == 0)
1218 return PyUnicode_AsASCIIString(unicode);
1219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220
1221 /* Encode via the codec registry */
1222 v = PyCodec_Encode(unicode, encoding, errors);
1223 if (v == NULL)
1224 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001225 assert(PyString_Check(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001227
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 onError:
1229 return NULL;
1230}
1231
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001232PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1233 const char *errors)
1234{
1235 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001236 if (v)
1237 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001238 if (errors != NULL)
1239 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001240 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001241 PyUnicode_GET_SIZE(unicode),
1242 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001243 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001244 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001245 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001246 return v;
1247}
1248
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001249PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001250PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001251 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001252 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1253}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001254
Christian Heimes5894ba72007-11-04 11:43:14 +00001255PyObject*
1256PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1257{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001258 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1259 can be undefined. If it is case, decode using UTF-8. The following assumes
1260 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1261 bootstrapping process where the codecs aren't ready yet.
1262 */
1263 if (Py_FileSystemDefaultEncoding) {
1264#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001265 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001266 return PyUnicode_DecodeMBCS(s, size, "replace");
1267 }
1268#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001269 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001270 return PyUnicode_DecodeUTF8(s, size, "replace");
1271 }
1272#endif
1273 return PyUnicode_Decode(s, size,
1274 Py_FileSystemDefaultEncoding,
1275 "replace");
1276 }
1277 else {
1278 return PyUnicode_DecodeUTF8(s, size, "replace");
1279 }
1280}
1281
Martin v. Löwis5b222132007-06-10 09:51:05 +00001282char*
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001283PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001284{
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001285 PyObject *str8;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 return NULL;
1289 }
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001290 str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1291 if (str8 == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001292 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001293 if (psize != NULL)
1294 *psize = PyString_GET_SIZE(str8);
1295 return PyString_AS_STRING(str8);
1296}
1297
1298char*
1299PyUnicode_AsString(PyObject *unicode)
1300{
1301 return PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001302}
1303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1305{
1306 if (!PyUnicode_Check(unicode)) {
1307 PyErr_BadArgument();
1308 goto onError;
1309 }
1310 return PyUnicode_AS_UNICODE(unicode);
1311
1312 onError:
1313 return NULL;
1314}
1315
Martin v. Löwis18e16552006-02-15 17:27:45 +00001316Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (!PyUnicode_Check(unicode)) {
1319 PyErr_BadArgument();
1320 goto onError;
1321 }
1322 return PyUnicode_GET_SIZE(unicode);
1323
1324 onError:
1325 return -1;
1326}
1327
Thomas Wouters78890102000-07-22 19:25:51 +00001328const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001329{
1330 return unicode_default_encoding;
1331}
1332
1333int PyUnicode_SetDefaultEncoding(const char *encoding)
1334{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001335 if (strcmp(encoding, unicode_default_encoding) != 0) {
1336 PyErr_Format(PyExc_ValueError,
1337 "Can only set default encoding to %s",
1338 unicode_default_encoding);
1339 return -1;
1340 }
Fred Drakee4315f52000-05-09 19:53:39 +00001341 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001342}
1343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344/* error handling callback helper:
1345 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001346 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001347 and adjust various state variables.
1348 return 0 on success, -1 on error
1349*/
1350
1351static
1352int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1353 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001354 const char **input, const char **inend, Py_ssize_t *startinpos,
1355 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001358 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359
1360 PyObject *restuple = NULL;
1361 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001362 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001363 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001364 Py_ssize_t requiredsize;
1365 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001366 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001367 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001368 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001369 int res = -1;
1370
1371 if (*errorHandler == NULL) {
1372 *errorHandler = PyCodec_LookupError(errors);
1373 if (*errorHandler == NULL)
1374 goto onError;
1375 }
1376
1377 if (*exceptionObject == NULL) {
1378 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001379 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001380 if (*exceptionObject == NULL)
1381 goto onError;
1382 }
1383 else {
1384 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1385 goto onError;
1386 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1387 goto onError;
1388 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1389 goto onError;
1390 }
1391
1392 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1393 if (restuple == NULL)
1394 goto onError;
1395 if (!PyTuple_Check(restuple)) {
1396 PyErr_Format(PyExc_TypeError, &argparse[4]);
1397 goto onError;
1398 }
1399 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1400 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001401
1402 /* Copy back the bytes variables, which might have been modified by the
1403 callback */
1404 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1405 if (!inputobj)
1406 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001407 if (!PyString_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001408 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1409 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001410 *input = PyString_AS_STRING(inputobj);
1411 insize = PyString_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001412 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001413 /* we can DECREF safely, as the exception has another reference,
1414 so the object won't go away. */
1415 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001417 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001418 newpos = insize+newpos;
1419 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001420 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001421 goto onError;
1422 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001423
1424 /* need more space? (at least enough for what we
1425 have+the replacement+the rest of the string (starting
1426 at the new input position), so we won't have to check space
1427 when there are no errors in the rest of the string) */
1428 repptr = PyUnicode_AS_UNICODE(repunicode);
1429 repsize = PyUnicode_GET_SIZE(repunicode);
1430 requiredsize = *outpos + repsize + insize-newpos;
1431 if (requiredsize > outsize) {
1432 if (requiredsize<2*outsize)
1433 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001434 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 goto onError;
1436 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1437 }
1438 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001439 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 Py_UNICODE_COPY(*outptr, repptr, repsize);
1441 *outptr += repsize;
1442 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001444 /* we made it! */
1445 res = 0;
1446
1447 onError:
1448 Py_XDECREF(restuple);
1449 return res;
1450}
1451
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001452/* --- UTF-7 Codec -------------------------------------------------------- */
1453
1454/* see RFC2152 for details */
1455
/* Classification of the 128 ASCII code points for the UTF-7 encoder,
   i.e. whether a character is special and cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1474
/* SPECIAL(c, encodeO, encodeWS) is true when the character c may not be
   written directly and has to be encoded: c is outside 1..127, or its
   utf7_special[] class is 1, or it is whitespace (class 2) and encodeWS
   is set, or it belongs to Set O (class 3) and encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true.  The range tests come first so that the table is only indexed
   with values in 1..127. */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))
1484
/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a character of the base64 alphabet. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')

/* UB64(c): inverse of B64() -- the 6-bit value of base64 character c.
   The constants are offsets from the character codes:
   'a' - 71 == 26, 'A' - 65 == 0, '0' + 4 == 52. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492
/* ENCODE(out, ch, bits): while at least six bits are pending in the
   accumulator ch, write the base64 character for the topmost six pending
   bits through out (advancing it) and reduce bits by six.  Expands to a
   bare while statement and modifies out and bits in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }
1498
/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in the accumulator ch, extract the topmost sixteen pending bits
   as one Py_UNICODE and reduce bits by sixteen.  A unit in the range
   0xDC00..0xDFFF sets surrogate and is reported as an error; the unit
   following such an error is dropped; any other unit is written through
   out (advancing it).

   Expands to a bare while statement.  The expanding function must provide
   a variable named errmsg and a label named utf7Error. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001517
/* Decode the UTF-7 byte string s of length size into a new unicode object.

   s       -- input bytes
   size    -- number of bytes at s
   errors  -- error handling name, handed unchanged to
              unicode_decode_call_errorhandler()

   Returns the new object, or NULL when _PyUnicode_New(), the error handler
   call or the final _PyUnicode_Resize() fails.  The output is created with
   room for size characters and cut back to the number actually written. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    const char *starts = s;         /* start of the input, base for error positions */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                  /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* next free slot in the output buffer */
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a '+' shift sequence */
    unsigned int bitsleft = 0;      /* number of not yet decoded bits in charsleft */
    unsigned long charsleft = 0;    /* accumulator for base64 payload bits */
    int surrogate = 0;              /* state flag maintained by the DECODE macro */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handling below, when the error handler left input to process. */
        restart:
        ch = *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* '-' or any non-base64 character ends the shift sequence. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* s already points past the terminating '-'.  If another
                       '-' follows, a '-' is written and shift mode is
                       entered again without consuming that byte.
                       NOTE(review): intent of re-entering shift mode here is
                       not evident from this function -- confirm. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else  {
                    /* The terminating character itself is ordinary output. */
                    *p++ = ch;
                }
            } else {
                /* Base64 payload: append six bits and emit every complete
                   16-bit unit. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the shift sequence starts for error reports. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" stands for a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* Report the range startinpos..endinpos; s, p, e, starts and the
           positions are handed over by address.  A non-zero return aborts
           the decoding. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
                 "utf7", errmsg,
                 &starts, &e, &startinpos, &endinpos, &exc, &s,
                 (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }

    if (inShift) {
        /* The input ended while a shift sequence was still open. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
                 "utf7", "unterminated shift sequence",
                 &starts, &e, &startinpos, &endinpos, &exc, &s,
                 (PyObject **)&unicode, &outpos, &p))
            goto onError;
        if (s < e)
            goto restart;
    }

    /* Cut the result back to the number of characters written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1653
1654
1655PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001656 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 int encodeSetO,
1658 int encodeWhiteSpace,
1659 const char *errors)
1660{
Guido van Rossum98297ee2007-11-06 21:34:58 +00001661 PyObject *v, *result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001663 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001665 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 unsigned int bitsleft = 0;
1667 unsigned long charsleft = 0;
1668 char * out;
1669 char * start;
1670
1671 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001672 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673
Walter Dörwald51ab4142007-05-05 14:43:36 +00001674 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 if (v == NULL)
1676 return NULL;
1677
Walter Dörwald51ab4142007-05-05 14:43:36 +00001678 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 for (;i < size; ++i) {
1680 Py_UNICODE ch = s[i];
1681
1682 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001683 if (ch == '+') {
1684 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001685 *out++ = '-';
1686 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1687 charsleft = ch;
1688 bitsleft = 16;
1689 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001690 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001692 } else {
1693 *out++ = (char) ch;
1694 }
1695 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001696 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1697 *out++ = B64(charsleft << (6-bitsleft));
1698 charsleft = 0;
1699 bitsleft = 0;
1700 /* Characters not in the BASE64 set implicitly unshift the sequence
1701 so no '-' is required, except if the character is itself a '-' */
1702 if (B64CHAR(ch) || ch == '-') {
1703 *out++ = '-';
1704 }
1705 inShift = 0;
1706 *out++ = (char) ch;
1707 } else {
1708 bitsleft += 16;
1709 charsleft = (charsleft << 16) | ch;
1710 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1711
1712 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001713 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001714 or '-' then the shift sequence will be terminated implicitly and we
1715 don't have to insert a '-'. */
1716
1717 if (bitsleft == 0) {
1718 if (i + 1 < size) {
1719 Py_UNICODE ch2 = s[i+1];
1720
1721 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001722
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001723 } else if (B64CHAR(ch2) || ch2 == '-') {
1724 *out++ = '-';
1725 inShift = 0;
1726 } else {
1727 inShift = 0;
1728 }
1729
1730 }
1731 else {
1732 *out++ = '-';
1733 inShift = 0;
1734 }
1735 }
Tim Petersced69f82003-09-16 20:30:58 +00001736 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001737 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001738 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001739 if (bitsleft) {
1740 *out++= B64(charsleft << (6-bitsleft) );
1741 *out++ = '-';
1742 }
1743
Guido van Rossum98297ee2007-11-06 21:34:58 +00001744 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
1745 Py_DECREF(v);
1746 return result;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001747}
1748
/* The UTF-7 helper macros are removed again here so that they are not
   visible to the code that follows. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756/* --- UTF-8 Codec -------------------------------------------------------- */
1757
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1779
/* Decode the size bytes at s as UTF-8.  This simply forwards to
   PyUnicode_DecodeUTF8Stateful() with consumed == NULL and returns
   whatever that function returns. */
PyObject *PyUnicode_DecodeUTF8(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
1786
/* Decode the size bytes at s as UTF-8 into a new unicode object.

   s        -- input bytes
   size     -- number of bytes at s
   errors   -- error handling name, handed unchanged to
               unicode_decode_call_errorhandler()
   consumed -- if not NULL, a sequence that is cut off by the end of the
               input stops the decoding without an error, and *consumed
               receives the number of input bytes processed; if NULL, such
               a sequence is reported as "unexpected end of data"

   Returns the new object, or NULL when _PyUnicode_New(), the error handler
   call or the final _PyUnicode_Resize() fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                        Py_ssize_t size,
                                        const char *errors,
                                        Py_ssize_t *consumed)
{
    const char *starts = s;         /* start of the input, base for error positions */
    int n;                          /* sequence length according to the lead byte */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                  /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* next free slot in the output buffer */
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size is never smaller than the number of Py_UNICODE units
       written by the cases below, so it is a safe initial length */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: plain ASCII byte. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* The sequence announced by the lead byte runs past the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            /* Byte that cannot start a sequence. */
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes below 0x80 were handled above and the table holds 1
               only for those, so this is not expected to be reached. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* A value below 0x80 would fit into one byte: overlong form. */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            if (ch < 0x0800) {
                /* A value below 0x0800 would fit into two bytes: overlong
                   form.

                   Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences (this is the only check made
                   on the value);

                   XXX For wide builds (UCS-4) we should probably try
                   to recombine the surrogates into a single code
                   unit.
                */
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        s += n;
        continue;

    utf8Error:
        /* Report the range startinpos..endinpos; s, p, e, starts and the
           positions are handed over by address.  A non-zero return aborts
           the decoding. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
                 "utf8", errmsg,
                 &starts, &e, &startinpos, &endinpos, &exc, &s,
                 (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1973
Tim Peters602f7402002-04-27 18:03:26 +00001974/* Allocation strategy: if the string is short, convert into a stack buffer
1975 and allocate exactly as much space needed at the end. Else allocate the
1976 maximum possible needed (4 result bytes per Unicode character), and return
1977 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001978*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001979PyObject *
1980PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001981 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001982 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983{
Tim Peters602f7402002-04-27 18:03:26 +00001984#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001985
Guido van Rossum98297ee2007-11-06 21:34:58 +00001986 Py_ssize_t i; /* index into s of next input byte */
1987 PyObject *result; /* result string object */
1988 char *p; /* next free byte in output buffer */
1989 Py_ssize_t nallocated; /* number of result bytes allocated */
1990 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001991 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001992
Tim Peters602f7402002-04-27 18:03:26 +00001993 assert(s != NULL);
1994 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995
Tim Peters602f7402002-04-27 18:03:26 +00001996 if (size <= MAX_SHORT_UNICHARS) {
1997 /* Write into the stack buffer; nallocated can't overflow.
1998 * At the end, we'll allocate exactly as much heap space as it
1999 * turns out we need.
2000 */
2001 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002002 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002003 p = stackbuf;
2004 }
2005 else {
2006 /* Overallocate on the heap, and give the excess back at the end. */
2007 nallocated = size * 4;
2008 if (nallocated / 4 != size) /* overflow! */
2009 return PyErr_NoMemory();
Guido van Rossum98297ee2007-11-06 21:34:58 +00002010 result = PyString_FromStringAndSize(NULL, nallocated);
2011 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002012 return NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00002013 p = PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002014 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002015
Tim Peters602f7402002-04-27 18:03:26 +00002016 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002018
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002019 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002020 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002022
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002024 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002025 *p++ = (char)(0xc0 | (ch >> 6));
2026 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002027 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002028 else {
Tim Peters602f7402002-04-27 18:03:26 +00002029 /* Encode UCS2 Unicode ordinals */
2030 if (ch < 0x10000) {
2031 /* Special case: check for high surrogate */
2032 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2033 Py_UCS4 ch2 = s[i];
2034 /* Check for low surrogate and combine the two to
2035 form a UCS4 value */
2036 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002037 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002038 i++;
2039 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002040 }
Tim Peters602f7402002-04-27 18:03:26 +00002041 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002042 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002043 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002044 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2045 *p++ = (char)(0x80 | (ch & 0x3f));
2046 continue;
2047 }
2048encodeUCS4:
2049 /* Encode UCS4 Unicode ordinals */
2050 *p++ = (char)(0xf0 | (ch >> 18));
2051 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2052 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2053 *p++ = (char)(0x80 | (ch & 0x3f));
2054 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002056
Guido van Rossum98297ee2007-11-06 21:34:58 +00002057 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002058 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002059 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002060 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002061 result = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002062 }
2063 else {
2064 /* Cut back to size actually needed. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00002065 nneeded = p - PyString_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002066 assert(nneeded <= nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002067 _PyString_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002068 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002069 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002070
Tim Peters602f7402002-04-27 18:03:26 +00002071#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072}
2073
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2075{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 if (!PyUnicode_Check(unicode)) {
2077 PyErr_BadArgument();
2078 return NULL;
2079 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002080 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2081 PyUnicode_GET_SIZE(unicode),
2082 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083}
2084
Walter Dörwald41980ca2007-08-16 21:55:45 +00002085/* --- UTF-32 Codec ------------------------------------------------------- */
2086
/* Decode the size bytes at s as UTF-32.  This simply forwards to
   PyUnicode_DecodeUTF32Stateful() with consumed == NULL; s, size, errors
   and byteorder are handed over unchanged and that function's result is
   returned. */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
2095
2096PyObject *
2097PyUnicode_DecodeUTF32Stateful(const char *s,
2098 Py_ssize_t size,
2099 const char *errors,
2100 int *byteorder,
2101 Py_ssize_t *consumed)
2102{
2103 const char *starts = s;
2104 Py_ssize_t startinpos;
2105 Py_ssize_t endinpos;
2106 Py_ssize_t outpos;
2107 PyUnicodeObject *unicode;
2108 Py_UNICODE *p;
2109#ifndef Py_UNICODE_WIDE
2110 int i, pairs;
2111#else
2112 const int pairs = 0;
2113#endif
2114 const unsigned char *q, *e;
2115 int bo = 0; /* assume native ordering by default */
2116 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002117 /* Offsets from q for retrieving bytes in the right order. */
2118#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2119 int iorder[] = {0, 1, 2, 3};
2120#else
2121 int iorder[] = {3, 2, 1, 0};
2122#endif
2123 PyObject *errorHandler = NULL;
2124 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002125 /* On narrow builds we split characters outside the BMP into two
2126 codepoints => count how much extra space we need. */
2127#ifndef Py_UNICODE_WIDE
2128 for (i = pairs = 0; i < size/4; i++)
2129 if (((Py_UCS4 *)s)[i] >= 0x10000)
2130 pairs++;
2131#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002132
2133 /* This might be one to much, because of a BOM */
2134 unicode = _PyUnicode_New((size+3)/4+pairs);
2135 if (!unicode)
2136 return NULL;
2137 if (size == 0)
2138 return (PyObject *)unicode;
2139
2140 /* Unpack UTF-32 encoded data */
2141 p = unicode->str;
2142 q = (unsigned char *)s;
2143 e = q + size;
2144
2145 if (byteorder)
2146 bo = *byteorder;
2147
2148 /* Check for BOM marks (U+FEFF) in the input and adjust current
2149 byte order setting accordingly. In native mode, the leading BOM
2150 mark is skipped, in all other modes, it is copied to the output
2151 stream as-is (giving a ZWNBSP character). */
2152 if (bo == 0) {
2153 if (size >= 4) {
2154 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2155 (q[iorder[1]] << 8) | q[iorder[0]];
2156#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2157 if (bom == 0x0000FEFF) {
2158 q += 4;
2159 bo = -1;
2160 }
2161 else if (bom == 0xFFFE0000) {
2162 q += 4;
2163 bo = 1;
2164 }
2165#else
2166 if (bom == 0x0000FEFF) {
2167 q += 4;
2168 bo = 1;
2169 }
2170 else if (bom == 0xFFFE0000) {
2171 q += 4;
2172 bo = -1;
2173 }
2174#endif
2175 }
2176 }
2177
2178 if (bo == -1) {
2179 /* force LE */
2180 iorder[0] = 0;
2181 iorder[1] = 1;
2182 iorder[2] = 2;
2183 iorder[3] = 3;
2184 }
2185 else if (bo == 1) {
2186 /* force BE */
2187 iorder[0] = 3;
2188 iorder[1] = 2;
2189 iorder[2] = 1;
2190 iorder[3] = 0;
2191 }
2192
2193 while (q < e) {
2194 Py_UCS4 ch;
2195 /* remaining bytes at the end? (size should be divisible by 4) */
2196 if (e-q<4) {
2197 if (consumed)
2198 break;
2199 errmsg = "truncated data";
2200 startinpos = ((const char *)q)-starts;
2201 endinpos = ((const char *)e)-starts;
2202 goto utf32Error;
2203 /* The remaining input chars are ignored if the callback
2204 chooses to skip the input */
2205 }
2206 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2207 (q[iorder[1]] << 8) | q[iorder[0]];
2208
2209 if (ch >= 0x110000)
2210 {
2211 errmsg = "codepoint not in range(0x110000)";
2212 startinpos = ((const char *)q)-starts;
2213 endinpos = startinpos+4;
2214 goto utf32Error;
2215 }
2216#ifndef Py_UNICODE_WIDE
2217 if (ch >= 0x10000)
2218 {
2219 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2220 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2221 }
2222 else
2223#endif
2224 *p++ = ch;
2225 q += 4;
2226 continue;
2227 utf32Error:
2228 outpos = p-PyUnicode_AS_UNICODE(unicode);
2229 if (unicode_decode_call_errorhandler(
2230 errors, &errorHandler,
2231 "utf32", errmsg,
2232 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2233 (PyObject **)&unicode, &outpos, &p))
2234 goto onError;
2235 }
2236
2237 if (byteorder)
2238 *byteorder = bo;
2239
2240 if (consumed)
2241 *consumed = (const char *)q-starts;
2242
2243 /* Adjust length */
2244 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2245 goto onError;
2246
2247 Py_XDECREF(errorHandler);
2248 Py_XDECREF(exc);
2249 return (PyObject *)unicode;
2250
2251onError:
2252 Py_DECREF(unicode);
2253 Py_XDECREF(errorHandler);
2254 Py_XDECREF(exc);
2255 return NULL;
2256}
2257
2258PyObject *
2259PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2260 Py_ssize_t size,
2261 const char *errors,
2262 int byteorder)
2263{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002264 PyObject *v, *result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002265 unsigned char *p;
2266#ifndef Py_UNICODE_WIDE
2267 int i, pairs;
2268#else
2269 const int pairs = 0;
2270#endif
2271 /* Offsets from p for storing byte pairs in the right order. */
2272#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2273 int iorder[] = {0, 1, 2, 3};
2274#else
2275 int iorder[] = {3, 2, 1, 0};
2276#endif
2277
2278#define STORECHAR(CH) \
2279 do { \
2280 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2281 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2282 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2283 p[iorder[0]] = (CH) & 0xff; \
2284 p += 4; \
2285 } while(0)
2286
2287 /* In narrow builds we can output surrogate pairs as one codepoint,
2288 so we need less space. */
2289#ifndef Py_UNICODE_WIDE
2290 for (i = pairs = 0; i < size-1; i++)
2291 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2292 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2293 pairs++;
2294#endif
2295 v = PyBytes_FromStringAndSize(NULL,
2296 4 * (size - pairs + (byteorder == 0)));
2297 if (v == NULL)
2298 return NULL;
2299
2300 p = (unsigned char *)PyBytes_AS_STRING(v);
2301 if (byteorder == 0)
2302 STORECHAR(0xFEFF);
2303 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002304 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002305
2306 if (byteorder == -1) {
2307 /* force LE */
2308 iorder[0] = 0;
2309 iorder[1] = 1;
2310 iorder[2] = 2;
2311 iorder[3] = 3;
2312 }
2313 else if (byteorder == 1) {
2314 /* force BE */
2315 iorder[0] = 3;
2316 iorder[1] = 2;
2317 iorder[2] = 1;
2318 iorder[3] = 0;
2319 }
2320
2321 while (size-- > 0) {
2322 Py_UCS4 ch = *s++;
2323#ifndef Py_UNICODE_WIDE
2324 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2325 Py_UCS4 ch2 = *s;
2326 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2327 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2328 s++;
2329 size--;
2330 }
2331 }
2332#endif
2333 STORECHAR(ch);
2334 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002335
2336 done:
2337 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
2338 Py_DECREF(v);
2339 return result;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002340#undef STORECHAR
2341}
2342
2343PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2344{
2345 if (!PyUnicode_Check(unicode)) {
2346 PyErr_BadArgument();
2347 return NULL;
2348 }
2349 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2350 PyUnicode_GET_SIZE(unicode),
2351 NULL,
2352 0);
2353}
2354
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355/* --- UTF-16 Codec ------------------------------------------------------- */
2356
Tim Peters772747b2001-08-09 22:21:55 +00002357PyObject *
2358PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002359 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002360 const char *errors,
2361 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362{
Walter Dörwald69652032004-09-07 20:24:22 +00002363 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2364}
2365
2366PyObject *
2367PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002368 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002369 const char *errors,
2370 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002371 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002372{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002374 Py_ssize_t startinpos;
2375 Py_ssize_t endinpos;
2376 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 PyUnicodeObject *unicode;
2378 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002379 const unsigned char *q, *e;
2380 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002381 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002382 /* Offsets from q for retrieving byte pairs in the right order. */
2383#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2384 int ihi = 1, ilo = 0;
2385#else
2386 int ihi = 0, ilo = 1;
2387#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002388 PyObject *errorHandler = NULL;
2389 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390
2391 /* Note: size will always be longer than the resulting Unicode
2392 character count */
2393 unicode = _PyUnicode_New(size);
2394 if (!unicode)
2395 return NULL;
2396 if (size == 0)
2397 return (PyObject *)unicode;
2398
2399 /* Unpack UTF-16 encoded data */
2400 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002401 q = (unsigned char *)s;
2402 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403
2404 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002405 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002407 /* Check for BOM marks (U+FEFF) in the input and adjust current
2408 byte order setting accordingly. In native mode, the leading BOM
2409 mark is skipped, in all other modes, it is copied to the output
2410 stream as-is (giving a ZWNBSP character). */
2411 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002412 if (size >= 2) {
2413 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002414#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002415 if (bom == 0xFEFF) {
2416 q += 2;
2417 bo = -1;
2418 }
2419 else if (bom == 0xFFFE) {
2420 q += 2;
2421 bo = 1;
2422 }
Tim Petersced69f82003-09-16 20:30:58 +00002423#else
Walter Dörwald69652032004-09-07 20:24:22 +00002424 if (bom == 0xFEFF) {
2425 q += 2;
2426 bo = 1;
2427 }
2428 else if (bom == 0xFFFE) {
2429 q += 2;
2430 bo = -1;
2431 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002432#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002433 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435
Tim Peters772747b2001-08-09 22:21:55 +00002436 if (bo == -1) {
2437 /* force LE */
2438 ihi = 1;
2439 ilo = 0;
2440 }
2441 else if (bo == 1) {
2442 /* force BE */
2443 ihi = 0;
2444 ilo = 1;
2445 }
2446
2447 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002448 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002449 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002450 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002451 if (consumed)
2452 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002453 errmsg = "truncated data";
2454 startinpos = ((const char *)q)-starts;
2455 endinpos = ((const char *)e)-starts;
2456 goto utf16Error;
2457 /* The remaining input chars are ignored if the callback
2458 chooses to skip the input */
2459 }
2460 ch = (q[ihi] << 8) | q[ilo];
2461
Tim Peters772747b2001-08-09 22:21:55 +00002462 q += 2;
2463
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 if (ch < 0xD800 || ch > 0xDFFF) {
2465 *p++ = ch;
2466 continue;
2467 }
2468
2469 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002470 if (q >= e) {
2471 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002472 startinpos = (((const char *)q)-2)-starts;
2473 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002474 goto utf16Error;
2475 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002476 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002477 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2478 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002479 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002480#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002481 *p++ = ch;
2482 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002483#else
2484 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002485#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002486 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002487 }
2488 else {
2489 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 startinpos = (((const char *)q)-4)-starts;
2491 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002492 goto utf16Error;
2493 }
2494
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002496 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002497 startinpos = (((const char *)q)-2)-starts;
2498 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002499 /* Fall through to report the error */
2500
2501 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 outpos = p-PyUnicode_AS_UNICODE(unicode);
2503 if (unicode_decode_call_errorhandler(
2504 errors, &errorHandler,
2505 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002506 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002508 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 }
2510
2511 if (byteorder)
2512 *byteorder = bo;
2513
Walter Dörwald69652032004-09-07 20:24:22 +00002514 if (consumed)
2515 *consumed = (const char *)q-starts;
2516
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002518 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 goto onError;
2520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 return (PyObject *)unicode;
2524
2525onError:
2526 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002527 Py_XDECREF(errorHandler);
2528 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 return NULL;
2530}
2531
Tim Peters772747b2001-08-09 22:21:55 +00002532PyObject *
2533PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002534 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002535 const char *errors,
2536 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002538 PyObject *v, *result;
Tim Peters772747b2001-08-09 22:21:55 +00002539 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002540#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002541 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002542#else
2543 const int pairs = 0;
2544#endif
Tim Peters772747b2001-08-09 22:21:55 +00002545 /* Offsets from p for storing byte pairs in the right order. */
2546#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2547 int ihi = 1, ilo = 0;
2548#else
2549 int ihi = 0, ilo = 1;
2550#endif
2551
2552#define STORECHAR(CH) \
2553 do { \
2554 p[ihi] = ((CH) >> 8) & 0xff; \
2555 p[ilo] = (CH) & 0xff; \
2556 p += 2; \
2557 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002559#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002560 for (i = pairs = 0; i < size; i++)
2561 if (s[i] >= 0x10000)
2562 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002563#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002564 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002565 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 if (v == NULL)
2567 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568
Walter Dörwald3cc34522007-05-04 10:48:27 +00002569 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002571 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002572 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002573 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002574
2575 if (byteorder == -1) {
2576 /* force LE */
2577 ihi = 1;
2578 ilo = 0;
2579 }
2580 else if (byteorder == 1) {
2581 /* force BE */
2582 ihi = 0;
2583 ilo = 1;
2584 }
2585
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002586 while (size-- > 0) {
2587 Py_UNICODE ch = *s++;
2588 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002589#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002590 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002591 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2592 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002594#endif
Tim Peters772747b2001-08-09 22:21:55 +00002595 STORECHAR(ch);
2596 if (ch2)
2597 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002598 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002599
2600 done:
2601 result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
2602 Py_DECREF(v);
2603 return result;
Tim Peters772747b2001-08-09 22:21:55 +00002604#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605}
2606
2607PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2608{
2609 if (!PyUnicode_Check(unicode)) {
2610 PyErr_BadArgument();
2611 return NULL;
2612 }
2613 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2614 PyUnicode_GET_SIZE(unicode),
2615 NULL,
2616 0);
2617}
2618
2619/* --- Unicode Escape Codec ----------------------------------------------- */
2620
Fredrik Lundh06d12682001-01-24 07:59:11 +00002621static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002622
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002624 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 const char *errors)
2626{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002628 Py_ssize_t startinpos;
2629 Py_ssize_t endinpos;
2630 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002635 char* message;
2636 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 PyObject *errorHandler = NULL;
2638 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002639
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 /* Escaped strings will always be longer than the resulting
2641 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002642 length after conversion to the true value.
2643 (but if the error callback returns a long replacement string
2644 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 v = _PyUnicode_New(size);
2646 if (v == NULL)
2647 goto onError;
2648 if (size == 0)
2649 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002653
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 while (s < end) {
2655 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002656 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658
2659 /* Non-escape characters are interpreted as Unicode ordinals */
2660 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002661 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 continue;
2663 }
2664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002665 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 /* \ - Escapes */
2667 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002668 c = *s++;
2669 if (s > end)
2670 c = '\0'; /* Invalid after \ */
2671 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672
2673 /* \x escapes */
2674 case '\n': break;
2675 case '\\': *p++ = '\\'; break;
2676 case '\'': *p++ = '\''; break;
2677 case '\"': *p++ = '\"'; break;
2678 case 'b': *p++ = '\b'; break;
2679 case 'f': *p++ = '\014'; break; /* FF */
2680 case 't': *p++ = '\t'; break;
2681 case 'n': *p++ = '\n'; break;
2682 case 'r': *p++ = '\r'; break;
2683 case 'v': *p++ = '\013'; break; /* VT */
2684 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2685
2686 /* \OOO (octal) escapes */
2687 case '0': case '1': case '2': case '3':
2688 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002689 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002690 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002691 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00002692 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002693 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002695 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 break;
2697
Fredrik Lundhccc74732001-02-18 22:13:49 +00002698 /* hex escapes */
2699 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002701 digits = 2;
2702 message = "truncated \\xXX escape";
2703 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704
Fredrik Lundhccc74732001-02-18 22:13:49 +00002705 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002707 digits = 4;
2708 message = "truncated \\uXXXX escape";
2709 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002712 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002713 digits = 8;
2714 message = "truncated \\UXXXXXXXX escape";
2715 hexescape:
2716 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 outpos = p-PyUnicode_AS_UNICODE(v);
2718 if (s+digits>end) {
2719 endinpos = size;
2720 if (unicode_decode_call_errorhandler(
2721 errors, &errorHandler,
2722 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002723 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 (PyObject **)&v, &outpos, &p))
2725 goto onError;
2726 goto nextByte;
2727 }
2728 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002729 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00002730 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 endinpos = (s+i+1)-starts;
2732 if (unicode_decode_call_errorhandler(
2733 errors, &errorHandler,
2734 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002735 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002737 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002739 }
2740 chr = (chr<<4) & ~0xF;
2741 if (c >= '0' && c <= '9')
2742 chr += c - '0';
2743 else if (c >= 'a' && c <= 'f')
2744 chr += 10 + c - 'a';
2745 else
2746 chr += 10 + c - 'A';
2747 }
2748 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002749 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 /* _decoding_error will have already written into the
2751 target buffer. */
2752 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002753 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002754 /* when we get here, chr is a 32-bit unicode character */
2755 if (chr <= 0xffff)
2756 /* UCS-2 character */
2757 *p++ = (Py_UNICODE) chr;
2758 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002759 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002760 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002761#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002762 *p++ = chr;
2763#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002764 chr -= 0x10000L;
2765 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002766 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002767#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002768 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 endinpos = s-starts;
2770 outpos = p-PyUnicode_AS_UNICODE(v);
2771 if (unicode_decode_call_errorhandler(
2772 errors, &errorHandler,
2773 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002774 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002776 goto onError;
2777 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002778 break;
2779
2780 /* \N{name} */
2781 case 'N':
2782 message = "malformed \\N character escape";
2783 if (ucnhash_CAPI == NULL) {
2784 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002785 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002786 m = PyImport_ImportModule("unicodedata");
2787 if (m == NULL)
2788 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002789 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002790 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002791 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002792 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002793 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002794 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 if (ucnhash_CAPI == NULL)
2796 goto ucnhashError;
2797 }
2798 if (*s == '{') {
2799 const char *start = s+1;
2800 /* look for the closing brace */
2801 while (*s != '}' && s < end)
2802 s++;
2803 if (s > start && s < end && *s == '}') {
2804 /* found a name. look it up in the unicode database */
2805 message = "unknown Unicode character name";
2806 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002807 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002808 goto store;
2809 }
2810 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 endinpos = s-starts;
2812 outpos = p-PyUnicode_AS_UNICODE(v);
2813 if (unicode_decode_call_errorhandler(
2814 errors, &errorHandler,
2815 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002816 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002818 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002819 break;
2820
2821 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002822 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 message = "\\ at end of string";
2824 s--;
2825 endinpos = s-starts;
2826 outpos = p-PyUnicode_AS_UNICODE(v);
2827 if (unicode_decode_call_errorhandler(
2828 errors, &errorHandler,
2829 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002830 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002832 goto onError;
2833 }
2834 else {
2835 *p++ = '\\';
2836 *p++ = (unsigned char)s[-1];
2837 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002838 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840 nextByte:
2841 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002843 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002845 Py_XDECREF(errorHandler);
2846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002848
Fredrik Lundhccc74732001-02-18 22:13:49 +00002849ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002850 PyErr_SetString(
2851 PyExc_UnicodeError,
2852 "\\N escapes not supported (can't load unicodedata module)"
2853 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002854 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 Py_XDECREF(errorHandler);
2856 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002857 return NULL;
2858
Fredrik Lundhccc74732001-02-18 22:13:49 +00002859onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 Py_XDECREF(errorHandler);
2862 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 return NULL;
2864}
2865
2866/* Return a Unicode-Escape string version of the Unicode object.
2867
2868 If quotes is true, the string is enclosed in u"" or u'' quotes as
2869 appropriate.
2870
2871*/
2872
Thomas Wouters477c8d52006-05-27 19:21:47 +00002873Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2874 Py_ssize_t size,
2875 Py_UNICODE ch)
2876{
2877 /* like wcschr, but doesn't stop at NULL characters */
2878
2879 while (size-- > 0) {
2880 if (*s == ch)
2881 return s;
2882 s++;
2883 }
2884
2885 return NULL;
2886}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002887
Walter Dörwald79e913e2007-05-12 11:08:06 +00002888static const char *hexdigits = "0123456789abcdef";
2889
2890PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2891 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892{
Guido van Rossum98297ee2007-11-06 21:34:58 +00002893 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895
Thomas Wouters89f507f2006-12-13 04:49:30 +00002896 /* XXX(nnorwitz): rather than over-allocating, it would be
2897 better to choose a different scheme. Perhaps scan the
2898 first N-chars of the string and allocate based on that size.
2899 */
2900 /* Initial allocation is based on the longest-possible unichr
2901 escape.
2902
2903 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2904 unichr, so in this case it's the longest unichr escape. In
2905 narrow (UTF-16) builds this is five chars per source unichr
2906 since there are two unichrs in the surrogate pair, so in narrow
2907 (UTF-16) builds it's not the longest unichr escape.
2908
2909 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2910 so in the narrow (UTF-16) build case it's the longest unichr
2911 escape.
2912 */
2913
Walter Dörwald79e913e2007-05-12 11:08:06 +00002914 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002915#ifdef Py_UNICODE_WIDE
2916 + 10*size
2917#else
2918 + 6*size
2919#endif
2920 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 if (repr == NULL)
2922 return NULL;
2923
Walter Dörwald79e913e2007-05-12 11:08:06 +00002924 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 while (size-- > 0) {
2927 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002928
Walter Dörwald79e913e2007-05-12 11:08:06 +00002929 /* Escape backslashes */
2930 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931 *p++ = '\\';
2932 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002933 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002934 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002935
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002936#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002937 /* Map 21-bit characters to '\U00xxxxxx' */
2938 else if (ch >= 0x10000) {
2939 *p++ = '\\';
2940 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002941 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2942 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2943 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2944 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2945 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2946 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2947 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2948 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002949 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002950 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002951#else
2952 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002953 else if (ch >= 0xD800 && ch < 0xDC00) {
2954 Py_UNICODE ch2;
2955 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002956
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002957 ch2 = *s++;
2958 size--;
2959 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2960 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2961 *p++ = '\\';
2962 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002963 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2964 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2965 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2966 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2967 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2968 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2969 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2970 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002971 continue;
2972 }
2973 /* Fall through: isolated surrogates are copied as-is */
2974 s--;
2975 size++;
2976 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002977#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002978
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002980 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 *p++ = '\\';
2982 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002983 *p++ = hexdigits[(ch >> 12) & 0x000F];
2984 *p++ = hexdigits[(ch >> 8) & 0x000F];
2985 *p++ = hexdigits[(ch >> 4) & 0x000F];
2986 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002988
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002989 /* Map special whitespace to '\t', \n', '\r' */
2990 else if (ch == '\t') {
2991 *p++ = '\\';
2992 *p++ = 't';
2993 }
2994 else if (ch == '\n') {
2995 *p++ = '\\';
2996 *p++ = 'n';
2997 }
2998 else if (ch == '\r') {
2999 *p++ = '\\';
3000 *p++ = 'r';
3001 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003002
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003003 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003004 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003006 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003007 *p++ = hexdigits[(ch >> 4) & 0x000F];
3008 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003010
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 /* Copy everything else as-is */
3012 else
3013 *p++ = (char) ch;
3014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015
Guido van Rossum98297ee2007-11-06 21:34:58 +00003016 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
3017 p - PyBytes_AS_STRING(repr));
3018 Py_DECREF(repr);
3019 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020}
3021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3023{
Walter Dörwald79e913e2007-05-12 11:08:06 +00003024 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 if (!PyUnicode_Check(unicode)) {
3026 PyErr_BadArgument();
3027 return NULL;
3028 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003029 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3030 PyUnicode_GET_SIZE(unicode));
3031
3032 if (!s)
3033 return NULL;
3034 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3035 PyBytes_GET_SIZE(s));
3036 Py_DECREF(s);
3037 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038}
3039
3040/* --- Raw Unicode Escape Codec ------------------------------------------- */
3041
3042PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003043 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 const char *errors)
3045{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003047 Py_ssize_t startinpos;
3048 Py_ssize_t endinpos;
3049 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 const char *end;
3053 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 PyObject *errorHandler = NULL;
3055 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 /* Escaped strings will always be longer than the resulting
3058 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 length after conversion to the true value. (But decoding error
3060 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 v = _PyUnicode_New(size);
3062 if (v == NULL)
3063 goto onError;
3064 if (size == 0)
3065 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 end = s + size;
3068 while (s < end) {
3069 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003070 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003072 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073
3074 /* Non-escape characters are interpreted as Unicode ordinals */
3075 if (*s != '\\') {
3076 *p++ = (unsigned char)*s++;
3077 continue;
3078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080
3081 /* \u-escapes are only interpreted iff the number of leading
3082 backslashes if odd */
3083 bs = s;
3084 for (;s < end;) {
3085 if (*s != '\\')
3086 break;
3087 *p++ = (unsigned char)*s++;
3088 }
3089 if (((s - bs) & 1) == 0 ||
3090 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003091 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 continue;
3093 }
3094 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003095 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 s++;
3097
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003098 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003100 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003102 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 endinpos = s-starts;
3104 if (unicode_decode_call_errorhandler(
3105 errors, &errorHandler,
3106 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003107 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 }
3112 x = (x<<4) & ~0xF;
3113 if (c >= '0' && c <= '9')
3114 x += c - '0';
3115 else if (c >= 'a' && c <= 'f')
3116 x += 10 + c - 'a';
3117 else
3118 x += 10 + c - 'A';
3119 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003120#ifndef Py_UNICODE_WIDE
3121 if (x > 0x10000) {
3122 if (unicode_decode_call_errorhandler(
3123 errors, &errorHandler,
3124 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003125 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003126 (PyObject **)&v, &outpos, &p))
3127 goto onError;
3128 }
3129#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003130 *p++ = x;
3131 nextByte:
3132 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003134 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003135 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 Py_XDECREF(errorHandler);
3137 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003139
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 onError:
3141 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 Py_XDECREF(errorHandler);
3143 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 return NULL;
3145}
3146
3147PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003148 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149{
Guido van Rossum98297ee2007-11-06 21:34:58 +00003150 PyObject *repr, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 char *p;
3152 char *q;
3153
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003154#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003155 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003156#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003157 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003158#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 if (repr == NULL)
3160 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003161 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003162 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163
Walter Dörwald711005d2007-05-12 12:03:26 +00003164 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 while (size-- > 0) {
3166 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003167#ifdef Py_UNICODE_WIDE
3168 /* Map 32-bit characters to '\Uxxxxxxxx' */
3169 if (ch >= 0x10000) {
3170 *p++ = '\\';
3171 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003172 *p++ = hexdigits[(ch >> 28) & 0xf];
3173 *p++ = hexdigits[(ch >> 24) & 0xf];
3174 *p++ = hexdigits[(ch >> 20) & 0xf];
3175 *p++ = hexdigits[(ch >> 16) & 0xf];
3176 *p++ = hexdigits[(ch >> 12) & 0xf];
3177 *p++ = hexdigits[(ch >> 8) & 0xf];
3178 *p++ = hexdigits[(ch >> 4) & 0xf];
3179 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003180 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181 else
3182#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 /* Map 16-bit characters to '\uxxxx' */
3184 if (ch >= 256) {
3185 *p++ = '\\';
3186 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003187 *p++ = hexdigits[(ch >> 12) & 0xf];
3188 *p++ = hexdigits[(ch >> 8) & 0xf];
3189 *p++ = hexdigits[(ch >> 4) & 0xf];
3190 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 }
3192 /* Copy everything else as-is */
3193 else
3194 *p++ = (char) ch;
3195 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003196 size = p - q;
3197
3198 done:
3199 result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
3200 Py_DECREF(repr);
3201 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202}
3203
3204PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3205{
Walter Dörwald711005d2007-05-12 12:03:26 +00003206 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003208 PyErr_BadArgument();
3209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003211 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3212 PyUnicode_GET_SIZE(unicode));
3213
3214 if (!s)
3215 return NULL;
3216 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3217 PyBytes_GET_SIZE(s));
3218 Py_DECREF(s);
3219 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220}
3221
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003222/* --- Unicode Internal Codec ------------------------------------------- */
3223
3224PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003225 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003226 const char *errors)
3227{
3228 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003229 Py_ssize_t startinpos;
3230 Py_ssize_t endinpos;
3231 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003232 PyUnicodeObject *v;
3233 Py_UNICODE *p;
3234 const char *end;
3235 const char *reason;
3236 PyObject *errorHandler = NULL;
3237 PyObject *exc = NULL;
3238
Neal Norwitzd43069c2006-01-08 01:12:10 +00003239#ifdef Py_UNICODE_WIDE
3240 Py_UNICODE unimax = PyUnicode_GetMax();
3241#endif
3242
Thomas Wouters89f507f2006-12-13 04:49:30 +00003243 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003244 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3245 if (v == NULL)
3246 goto onError;
3247 if (PyUnicode_GetSize((PyObject *)v) == 0)
3248 return (PyObject *)v;
3249 p = PyUnicode_AS_UNICODE(v);
3250 end = s + size;
3251
3252 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003253 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003254 /* We have to sanity check the raw data, otherwise doom looms for
3255 some malformed UCS-4 data. */
3256 if (
3257 #ifdef Py_UNICODE_WIDE
3258 *p > unimax || *p < 0 ||
3259 #endif
3260 end-s < Py_UNICODE_SIZE
3261 )
3262 {
3263 startinpos = s - starts;
3264 if (end-s < Py_UNICODE_SIZE) {
3265 endinpos = end-starts;
3266 reason = "truncated input";
3267 }
3268 else {
3269 endinpos = s - starts + Py_UNICODE_SIZE;
3270 reason = "illegal code point (> 0x10FFFF)";
3271 }
3272 outpos = p - PyUnicode_AS_UNICODE(v);
3273 if (unicode_decode_call_errorhandler(
3274 errors, &errorHandler,
3275 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003276 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003277 (PyObject **)&v, &outpos, &p)) {
3278 goto onError;
3279 }
3280 }
3281 else {
3282 p++;
3283 s += Py_UNICODE_SIZE;
3284 }
3285 }
3286
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003288 goto onError;
3289 Py_XDECREF(errorHandler);
3290 Py_XDECREF(exc);
3291 return (PyObject *)v;
3292
3293 onError:
3294 Py_XDECREF(v);
3295 Py_XDECREF(errorHandler);
3296 Py_XDECREF(exc);
3297 return NULL;
3298}
3299
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300/* --- Latin-1 Codec ------------------------------------------------------ */
3301
3302PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003303 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 const char *errors)
3305{
3306 PyUnicodeObject *v;
3307 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003308
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003310 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003311 Py_UNICODE r = *(unsigned char*)s;
3312 return PyUnicode_FromUnicode(&r, 1);
3313 }
3314
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 v = _PyUnicode_New(size);
3316 if (v == NULL)
3317 goto onError;
3318 if (size == 0)
3319 return (PyObject *)v;
3320 p = PyUnicode_AS_UNICODE(v);
3321 while (size-- > 0)
3322 *p++ = (unsigned char)*s++;
3323 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003324
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 onError:
3326 Py_XDECREF(v);
3327 return NULL;
3328}
3329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330/* create or adjust a UnicodeEncodeError */
3331static void make_encode_exception(PyObject **exceptionObject,
3332 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 const Py_UNICODE *unicode, Py_ssize_t size,
3334 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 if (*exceptionObject == NULL) {
3338 *exceptionObject = PyUnicodeEncodeError_Create(
3339 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 }
3341 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3343 goto onError;
3344 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3345 goto onError;
3346 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3347 goto onError;
3348 return;
3349 onError:
3350 Py_DECREF(*exceptionObject);
3351 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 }
3353}
3354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355/* raises a UnicodeEncodeError */
3356static void raise_encode_exception(PyObject **exceptionObject,
3357 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003358 const Py_UNICODE *unicode, Py_ssize_t size,
3359 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 const char *reason)
3361{
3362 make_encode_exception(exceptionObject,
3363 encoding, unicode, size, startpos, endpos, reason);
3364 if (*exceptionObject != NULL)
3365 PyCodec_StrictErrors(*exceptionObject);
3366}
3367
3368/* error handling callback helper:
3369 build arguments, call the callback and check the arguments,
3370 put the result into newpos and return the replacement string, which
3371 has to be freed by the caller */
3372static PyObject *unicode_encode_call_errorhandler(const char *errors,
3373 PyObject **errorHandler,
3374 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3376 Py_ssize_t startpos, Py_ssize_t endpos,
3377 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003379 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003380
3381 PyObject *restuple;
3382 PyObject *resunicode;
3383
3384 if (*errorHandler == NULL) {
3385 *errorHandler = PyCodec_LookupError(errors);
3386 if (*errorHandler == NULL)
3387 return NULL;
3388 }
3389
3390 make_encode_exception(exceptionObject,
3391 encoding, unicode, size, startpos, endpos, reason);
3392 if (*exceptionObject == NULL)
3393 return NULL;
3394
3395 restuple = PyObject_CallFunctionObjArgs(
3396 *errorHandler, *exceptionObject, NULL);
3397 if (restuple == NULL)
3398 return NULL;
3399 if (!PyTuple_Check(restuple)) {
3400 PyErr_Format(PyExc_TypeError, &argparse[4]);
3401 Py_DECREF(restuple);
3402 return NULL;
3403 }
3404 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3405 &resunicode, newpos)) {
3406 Py_DECREF(restuple);
3407 return NULL;
3408 }
3409 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003410 *newpos = size+*newpos;
3411 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003412 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003413 Py_DECREF(restuple);
3414 return NULL;
3415 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 Py_INCREF(resunicode);
3417 Py_DECREF(restuple);
3418 return resunicode;
3419}
3420
3421static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 const char *errors,
3424 int limit)
3425{
3426 /* output object */
3427 PyObject *res;
3428 /* pointers to the beginning and end+1 of input */
3429 const Py_UNICODE *startp = p;
3430 const Py_UNICODE *endp = p + size;
3431 /* pointer to the beginning of the unencodable characters */
3432 /* const Py_UNICODE *badp = NULL; */
3433 /* pointer into the output */
3434 char *str;
3435 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003436 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003437 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3438 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 PyObject *errorHandler = NULL;
3440 PyObject *exc = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +00003441 PyObject *result = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 /* the following variable is used for caching string comparisons
3443 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3444 int known_errorHandler = -1;
3445
3446 /* allocate enough for a simple encoding without
3447 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003448 if (size == 0)
3449 return PyString_FromStringAndSize(NULL, 0);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003450 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003452 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003453 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 ressize = size;
3455
3456 while (p<endp) {
3457 Py_UNICODE c = *p;
3458
3459 /* can we encode this? */
3460 if (c<limit) {
3461 /* no overflow check, because we know that the space is enough */
3462 *str++ = (char)c;
3463 ++p;
3464 }
3465 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003466 Py_ssize_t unicodepos = p-startp;
3467 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003469 Py_ssize_t repsize;
3470 Py_ssize_t newpos;
3471 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 Py_UNICODE *uni2;
3473 /* startpos for collecting unencodable chars */
3474 const Py_UNICODE *collstart = p;
3475 const Py_UNICODE *collend = p;
3476 /* find all unecodable characters */
3477 while ((collend < endp) && ((*collend)>=limit))
3478 ++collend;
3479 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3480 if (known_errorHandler==-1) {
3481 if ((errors==NULL) || (!strcmp(errors, "strict")))
3482 known_errorHandler = 1;
3483 else if (!strcmp(errors, "replace"))
3484 known_errorHandler = 2;
3485 else if (!strcmp(errors, "ignore"))
3486 known_errorHandler = 3;
3487 else if (!strcmp(errors, "xmlcharrefreplace"))
3488 known_errorHandler = 4;
3489 else
3490 known_errorHandler = 0;
3491 }
3492 switch (known_errorHandler) {
3493 case 1: /* strict */
3494 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3495 goto onError;
3496 case 2: /* replace */
3497 while (collstart++<collend)
3498 *str++ = '?'; /* fall through */
3499 case 3: /* ignore */
3500 p = collend;
3501 break;
3502 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003503 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504 /* determine replacement size (temporarily (mis)uses p) */
3505 for (p = collstart, repsize = 0; p < collend; ++p) {
3506 if (*p<10)
3507 repsize += 2+1+1;
3508 else if (*p<100)
3509 repsize += 2+2+1;
3510 else if (*p<1000)
3511 repsize += 2+3+1;
3512 else if (*p<10000)
3513 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003514#ifndef Py_UNICODE_WIDE
3515 else
3516 repsize += 2+5+1;
3517#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 else if (*p<100000)
3519 repsize += 2+5+1;
3520 else if (*p<1000000)
3521 repsize += 2+6+1;
3522 else
3523 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003524#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 }
3526 requiredsize = respos+repsize+(endp-collend);
3527 if (requiredsize > ressize) {
3528 if (requiredsize<2*ressize)
3529 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003530 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003532 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 ressize = requiredsize;
3534 }
3535 /* generate replacement (temporarily (mis)uses p) */
3536 for (p = collstart; p < collend; ++p) {
3537 str += sprintf(str, "&#%d;", (int)*p);
3538 }
3539 p = collend;
3540 break;
3541 default:
3542 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3543 encoding, reason, startp, size, &exc,
3544 collstart-startp, collend-startp, &newpos);
3545 if (repunicode == NULL)
3546 goto onError;
3547 /* need more space? (at least enough for what we
3548 have+the replacement+the rest of the string, so
3549 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003550 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 repsize = PyUnicode_GET_SIZE(repunicode);
3552 requiredsize = respos+repsize+(endp-collend);
3553 if (requiredsize > ressize) {
3554 if (requiredsize<2*ressize)
3555 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003556 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 Py_DECREF(repunicode);
3558 goto onError;
3559 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003560 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 ressize = requiredsize;
3562 }
3563 /* check if there is anything unencodable in the replacement
3564 and copy it to the output */
3565 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3566 c = *uni2;
3567 if (c >= limit) {
3568 raise_encode_exception(&exc, encoding, startp, size,
3569 unicodepos, unicodepos+1, reason);
3570 Py_DECREF(repunicode);
3571 goto onError;
3572 }
3573 *str = (char)c;
3574 }
3575 p = startp + newpos;
3576 Py_DECREF(repunicode);
3577 }
3578 }
3579 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003580 result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
3581 str - PyBytes_AS_STRING(res));
3582 onError:
3583 Py_DECREF(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 Py_XDECREF(errorHandler);
3585 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003586 return result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587}
3588
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 const char *errors)
3592{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594}
3595
3596PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3597{
3598 if (!PyUnicode_Check(unicode)) {
3599 PyErr_BadArgument();
3600 return NULL;
3601 }
3602 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3603 PyUnicode_GET_SIZE(unicode),
3604 NULL);
3605}
3606
3607/* --- 7-bit ASCII Codec -------------------------------------------------- */
3608
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003610 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 const char *errors)
3612{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 PyUnicodeObject *v;
3615 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003616 Py_ssize_t startinpos;
3617 Py_ssize_t endinpos;
3618 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 const char *e;
3620 PyObject *errorHandler = NULL;
3621 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003622
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003624 if (size == 1 && *(unsigned char*)s < 128) {
3625 Py_UNICODE r = *(unsigned char*)s;
3626 return PyUnicode_FromUnicode(&r, 1);
3627 }
Tim Petersced69f82003-09-16 20:30:58 +00003628
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 v = _PyUnicode_New(size);
3630 if (v == NULL)
3631 goto onError;
3632 if (size == 0)
3633 return (PyObject *)v;
3634 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 e = s + size;
3636 while (s < e) {
3637 register unsigned char c = (unsigned char)*s;
3638 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 ++s;
3641 }
3642 else {
3643 startinpos = s-starts;
3644 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003645 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 if (unicode_decode_call_errorhandler(
3647 errors, &errorHandler,
3648 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003649 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003654 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003655 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003656 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 Py_XDECREF(errorHandler);
3658 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003660
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 onError:
3662 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 Py_XDECREF(errorHandler);
3664 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 return NULL;
3666}
3667
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003669 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 const char *errors)
3671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673}
3674
3675PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3676{
3677 if (!PyUnicode_Check(unicode)) {
3678 PyErr_BadArgument();
3679 return NULL;
3680 }
3681 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3682 PyUnicode_GET_SIZE(unicode),
3683 NULL);
3684}
3685
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003686#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003687
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003688/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003689
/* The MBCS helpers below take their lengths as int, while the public entry
   points accept Py_ssize_t.  When Py_ssize_t is wider than int, inputs
   longer than INT_MAX have to be processed in several passes; NEED_RETRY
   enables that code. */
#if SIZEOF_INT < SIZEOF_SSIZE_T
#define NEED_RETRY
#endif
3693
3694/* XXX This code is limited to "true" double-byte encodings, as
3695 a) it assumes an incomplete character consists of a single byte, and
3696 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3697 encodings, see IsDBCSLeadByteEx documentation. */
3698
/* Report whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   The byte must satisfy IsDBCSLeadByte(); in addition the character
   preceding it (located with CharPrev()) must either not exist, not be a
   lead byte itself, or be a complete two-byte character.

   Returns 1 if so, 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3709
3710/*
3711 * Decode MBCS string into unicode object. If 'final' is set, converts
3712 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3713 */
3714static int decode_mbcs(PyUnicodeObject **v,
3715 const char *s, /* MBCS string */
3716 int size, /* sizeof MBCS string */
3717 int final)
3718{
3719 Py_UNICODE *p;
3720 Py_ssize_t n = 0;
3721 int usize = 0;
3722
3723 assert(size >= 0);
3724
3725 /* Skip trailing lead-byte unless 'final' is set */
3726 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3727 --size;
3728
3729 /* First get the size of the result */
3730 if (size > 0) {
3731 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3732 if (usize == 0) {
3733 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3734 return -1;
3735 }
3736 }
3737
3738 if (*v == NULL) {
3739 /* Create unicode object */
3740 *v = _PyUnicode_New(usize);
3741 if (*v == NULL)
3742 return -1;
3743 }
3744 else {
3745 /* Extend unicode object */
3746 n = PyUnicode_GET_SIZE(*v);
3747 if (_PyUnicode_Resize(v, n + usize) < 0)
3748 return -1;
3749 }
3750
3751 /* Do the conversion */
3752 if (size > 0) {
3753 p = PyUnicode_AS_UNICODE(*v) + n;
3754 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3755 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3756 return -1;
3757 }
3758 }
3759
3760 return size;
3761}
3762
3763PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3764 Py_ssize_t size,
3765 const char *errors,
3766 Py_ssize_t *consumed)
3767{
3768 PyUnicodeObject *v = NULL;
3769 int done;
3770
3771 if (consumed)
3772 *consumed = 0;
3773
3774#ifdef NEED_RETRY
3775 retry:
3776 if (size > INT_MAX)
3777 done = decode_mbcs(&v, s, INT_MAX, 0);
3778 else
3779#endif
3780 done = decode_mbcs(&v, s, (int)size, !consumed);
3781
3782 if (done < 0) {
3783 Py_XDECREF(v);
3784 return NULL;
3785 }
3786
3787 if (consumed)
3788 *consumed += done;
3789
3790#ifdef NEED_RETRY
3791 if (size > INT_MAX) {
3792 s += done;
3793 size -= done;
3794 goto retry;
3795 }
3796#endif
3797
3798 return (PyObject *)v;
3799}
3800
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003801PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003802 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003803 const char *errors)
3804{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003805 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3806}
3807
3808/*
3809 * Convert unicode into string object (MBCS).
3810 * Returns 0 if succeed, -1 otherwise.
3811 */
3812static int encode_mbcs(PyObject **repr,
3813 const Py_UNICODE *p, /* unicode */
3814 int size) /* size of unicode */
3815{
3816 int mbcssize = 0;
3817 Py_ssize_t n = 0;
3818
3819 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003820
3821 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003822 if (size > 0) {
3823 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3824 if (mbcssize == 0) {
3825 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3826 return -1;
3827 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003828 }
3829
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003830 if (*repr == NULL) {
3831 /* Create string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003832 *repr = PyString_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003833 if (*repr == NULL)
3834 return -1;
3835 }
3836 else {
3837 /* Extend string object */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003838 n = PyString_Size(*repr);
3839 if (_PyString_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003840 return -1;
3841 }
3842
3843 /* Do the conversion */
3844 if (size > 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00003845 char *s = PyString_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003846 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3847 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3848 return -1;
3849 }
3850 }
3851
3852 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003853}
3854
3855PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003856 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003857 const char *errors)
3858{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003859 PyObject *repr = NULL;
3860 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003861
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003862#ifdef NEED_RETRY
3863 retry:
3864 if (size > INT_MAX)
3865 ret = encode_mbcs(&repr, p, INT_MAX);
3866 else
3867#endif
3868 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003869
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003870 if (ret < 0) {
3871 Py_XDECREF(repr);
3872 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003873 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003874
3875#ifdef NEED_RETRY
3876 if (size > INT_MAX) {
3877 p += INT_MAX;
3878 size -= INT_MAX;
3879 goto retry;
3880 }
3881#endif
3882
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003883 return repr;
3884}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003885
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003886PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3887{
3888 if (!PyUnicode_Check(unicode)) {
3889 PyErr_BadArgument();
3890 return NULL;
3891 }
3892 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3893 PyUnicode_GET_SIZE(unicode),
3894 NULL);
3895}
3896
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003897#undef NEED_RETRY
3898
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003899#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003900
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901/* --- Character Mapping Codec -------------------------------------------- */
3902
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003904 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 PyObject *mapping,
3906 const char *errors)
3907{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003909 Py_ssize_t startinpos;
3910 Py_ssize_t endinpos;
3911 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 PyUnicodeObject *v;
3914 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003915 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 PyObject *errorHandler = NULL;
3917 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003918 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003919 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003920
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 /* Default to Latin-1 */
3922 if (mapping == NULL)
3923 return PyUnicode_DecodeLatin1(s, size, errors);
3924
3925 v = _PyUnicode_New(size);
3926 if (v == NULL)
3927 goto onError;
3928 if (size == 0)
3929 return (PyObject *)v;
3930 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003932 if (PyUnicode_CheckExact(mapping)) {
3933 mapstring = PyUnicode_AS_UNICODE(mapping);
3934 maplen = PyUnicode_GET_SIZE(mapping);
3935 while (s < e) {
3936 unsigned char ch = *s;
3937 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003939 if (ch < maplen)
3940 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003942 if (x == 0xfffe) {
3943 /* undefined mapping */
3944 outpos = p-PyUnicode_AS_UNICODE(v);
3945 startinpos = s-starts;
3946 endinpos = startinpos+1;
3947 if (unicode_decode_call_errorhandler(
3948 errors, &errorHandler,
3949 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003950 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003951 (PyObject **)&v, &outpos, &p)) {
3952 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003953 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003954 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003955 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003956 *p++ = x;
3957 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003959 }
3960 else {
3961 while (s < e) {
3962 unsigned char ch = *s;
3963 PyObject *w, *x;
3964
3965 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3966 w = PyInt_FromLong((long)ch);
3967 if (w == NULL)
3968 goto onError;
3969 x = PyObject_GetItem(mapping, w);
3970 Py_DECREF(w);
3971 if (x == NULL) {
3972 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3973 /* No mapping found means: mapping is undefined. */
3974 PyErr_Clear();
3975 x = Py_None;
3976 Py_INCREF(x);
3977 } else
3978 goto onError;
3979 }
3980
3981 /* Apply mapping */
3982 if (PyInt_Check(x)) {
3983 long value = PyInt_AS_LONG(x);
3984 if (value < 0 || value > 65535) {
3985 PyErr_SetString(PyExc_TypeError,
3986 "character mapping must be in range(65536)");
3987 Py_DECREF(x);
3988 goto onError;
3989 }
3990 *p++ = (Py_UNICODE)value;
3991 }
3992 else if (x == Py_None) {
3993 /* undefined mapping */
3994 outpos = p-PyUnicode_AS_UNICODE(v);
3995 startinpos = s-starts;
3996 endinpos = startinpos+1;
3997 if (unicode_decode_call_errorhandler(
3998 errors, &errorHandler,
3999 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004000 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004001 (PyObject **)&v, &outpos, &p)) {
4002 Py_DECREF(x);
4003 goto onError;
4004 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004005 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004006 continue;
4007 }
4008 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004009 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004010
4011 if (targetsize == 1)
4012 /* 1-1 mapping */
4013 *p++ = *PyUnicode_AS_UNICODE(x);
4014
4015 else if (targetsize > 1) {
4016 /* 1-n mapping */
4017 if (targetsize > extrachars) {
4018 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004019 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4020 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004021 (targetsize << 2);
4022 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004023 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004024 if (_PyUnicode_Resize(&v,
4025 PyUnicode_GET_SIZE(v) + needed) < 0) {
4026 Py_DECREF(x);
4027 goto onError;
4028 }
4029 p = PyUnicode_AS_UNICODE(v) + oldpos;
4030 }
4031 Py_UNICODE_COPY(p,
4032 PyUnicode_AS_UNICODE(x),
4033 targetsize);
4034 p += targetsize;
4035 extrachars -= targetsize;
4036 }
4037 /* 1-0 mapping: skip the character */
4038 }
4039 else {
4040 /* wrong return value */
4041 PyErr_SetString(PyExc_TypeError,
4042 "character mapping must return integer, None or unicode");
4043 Py_DECREF(x);
4044 goto onError;
4045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004047 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049 }
4050 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004051 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 Py_XDECREF(errorHandler);
4054 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 Py_XDECREF(errorHandler);
4059 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 Py_XDECREF(v);
4061 return NULL;
4062}
4063
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004064/* Charmap encoding: the lookup table */
4065
4066struct encoding_map{
4067 PyObject_HEAD
4068 unsigned char level1[32];
4069 int count2, count3;
4070 unsigned char level23[1];
4071};
4072
4073static PyObject*
4074encoding_map_size(PyObject *obj, PyObject* args)
4075{
4076 struct encoding_map *map = (struct encoding_map*)obj;
4077 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4078 128*map->count3);
4079}
4080
4081static PyMethodDef encoding_map_methods[] = {
4082 {"size", encoding_map_size, METH_NOARGS,
4083 PyDoc_STR("Return the size (in bytes) of this object") },
4084 { 0 }
4085};
4086
4087static void
4088encoding_map_dealloc(PyObject* o)
4089{
4090 PyObject_FREE(o);
4091}
4092
4093static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004094 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004095 "EncodingMap", /*tp_name*/
4096 sizeof(struct encoding_map), /*tp_basicsize*/
4097 0, /*tp_itemsize*/
4098 /* methods */
4099 encoding_map_dealloc, /*tp_dealloc*/
4100 0, /*tp_print*/
4101 0, /*tp_getattr*/
4102 0, /*tp_setattr*/
4103 0, /*tp_compare*/
4104 0, /*tp_repr*/
4105 0, /*tp_as_number*/
4106 0, /*tp_as_sequence*/
4107 0, /*tp_as_mapping*/
4108 0, /*tp_hash*/
4109 0, /*tp_call*/
4110 0, /*tp_str*/
4111 0, /*tp_getattro*/
4112 0, /*tp_setattro*/
4113 0, /*tp_as_buffer*/
4114 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4115 0, /*tp_doc*/
4116 0, /*tp_traverse*/
4117 0, /*tp_clear*/
4118 0, /*tp_richcompare*/
4119 0, /*tp_weaklistoffset*/
4120 0, /*tp_iter*/
4121 0, /*tp_iternext*/
4122 encoding_map_methods, /*tp_methods*/
4123 0, /*tp_members*/
4124 0, /*tp_getset*/
4125 0, /*tp_base*/
4126 0, /*tp_dict*/
4127 0, /*tp_descr_get*/
4128 0, /*tp_descr_set*/
4129 0, /*tp_dictoffset*/
4130 0, /*tp_init*/
4131 0, /*tp_alloc*/
4132 0, /*tp_new*/
4133 0, /*tp_free*/
4134 0, /*tp_is_gc*/
4135};
4136
4137PyObject*
4138PyUnicode_BuildEncodingMap(PyObject* string)
4139{
4140 Py_UNICODE *decode;
4141 PyObject *result;
4142 struct encoding_map *mresult;
4143 int i;
4144 int need_dict = 0;
4145 unsigned char level1[32];
4146 unsigned char level2[512];
4147 unsigned char *mlevel1, *mlevel2, *mlevel3;
4148 int count2 = 0, count3 = 0;
4149
4150 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4151 PyErr_BadArgument();
4152 return NULL;
4153 }
4154 decode = PyUnicode_AS_UNICODE(string);
4155 memset(level1, 0xFF, sizeof level1);
4156 memset(level2, 0xFF, sizeof level2);
4157
4158 /* If there isn't a one-to-one mapping of NULL to \0,
4159 or if there are non-BMP characters, we need to use
4160 a mapping dictionary. */
4161 if (decode[0] != 0)
4162 need_dict = 1;
4163 for (i = 1; i < 256; i++) {
4164 int l1, l2;
4165 if (decode[i] == 0
4166 #ifdef Py_UNICODE_WIDE
4167 || decode[i] > 0xFFFF
4168 #endif
4169 ) {
4170 need_dict = 1;
4171 break;
4172 }
4173 if (decode[i] == 0xFFFE)
4174 /* unmapped character */
4175 continue;
4176 l1 = decode[i] >> 11;
4177 l2 = decode[i] >> 7;
4178 if (level1[l1] == 0xFF)
4179 level1[l1] = count2++;
4180 if (level2[l2] == 0xFF)
4181 level2[l2] = count3++;
4182 }
4183
4184 if (count2 >= 0xFF || count3 >= 0xFF)
4185 need_dict = 1;
4186
4187 if (need_dict) {
4188 PyObject *result = PyDict_New();
4189 PyObject *key, *value;
4190 if (!result)
4191 return NULL;
4192 for (i = 0; i < 256; i++) {
4193 key = value = NULL;
4194 key = PyInt_FromLong(decode[i]);
4195 value = PyInt_FromLong(i);
4196 if (!key || !value)
4197 goto failed1;
4198 if (PyDict_SetItem(result, key, value) == -1)
4199 goto failed1;
4200 Py_DECREF(key);
4201 Py_DECREF(value);
4202 }
4203 return result;
4204 failed1:
4205 Py_XDECREF(key);
4206 Py_XDECREF(value);
4207 Py_DECREF(result);
4208 return NULL;
4209 }
4210
4211 /* Create a three-level trie */
4212 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4213 16*count2 + 128*count3 - 1);
4214 if (!result)
4215 return PyErr_NoMemory();
4216 PyObject_Init(result, &EncodingMapType);
4217 mresult = (struct encoding_map*)result;
4218 mresult->count2 = count2;
4219 mresult->count3 = count3;
4220 mlevel1 = mresult->level1;
4221 mlevel2 = mresult->level23;
4222 mlevel3 = mresult->level23 + 16*count2;
4223 memcpy(mlevel1, level1, 32);
4224 memset(mlevel2, 0xFF, 16*count2);
4225 memset(mlevel3, 0, 128*count3);
4226 count3 = 0;
4227 for (i = 1; i < 256; i++) {
4228 int o1, o2, o3, i2, i3;
4229 if (decode[i] == 0xFFFE)
4230 /* unmapped character */
4231 continue;
4232 o1 = decode[i]>>11;
4233 o2 = (decode[i]>>7) & 0xF;
4234 i2 = 16*mlevel1[o1] + o2;
4235 if (mlevel2[i2] == 0xFF)
4236 mlevel2[i2] = count3++;
4237 o3 = decode[i] & 0x7F;
4238 i3 = 128*mlevel2[i2] + o3;
4239 mlevel3[i3] = i;
4240 }
4241 return result;
4242}
4243
4244static int
4245encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4246{
4247 struct encoding_map *map = (struct encoding_map*)mapping;
4248 int l1 = c>>11;
4249 int l2 = (c>>7) & 0xF;
4250 int l3 = c & 0x7F;
4251 int i;
4252
4253#ifdef Py_UNICODE_WIDE
4254 if (c > 0xFFFF) {
4255 return -1;
4256 }
4257#endif
4258 if (c == 0)
4259 return 0;
4260 /* level 1*/
4261 i = map->level1[l1];
4262 if (i == 0xFF) {
4263 return -1;
4264 }
4265 /* level 2*/
4266 i = map->level23[16*i+l2];
4267 if (i == 0xFF) {
4268 return -1;
4269 }
4270 /* level 3 */
4271 i = map->level23[16*map->count2 + 128*i + l3];
4272 if (i == 0) {
4273 return -1;
4274 }
4275 return i;
4276}
4277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278/* Lookup the character ch in the mapping. If the character
4279 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004280 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 PyObject *w = PyInt_FromLong((long)c);
4284 PyObject *x;
4285
4286 if (w == NULL)
4287 return NULL;
4288 x = PyObject_GetItem(mapping, w);
4289 Py_DECREF(w);
4290 if (x == NULL) {
4291 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4292 /* No mapping found means: mapping is undefined. */
4293 PyErr_Clear();
4294 x = Py_None;
4295 Py_INCREF(x);
4296 return x;
4297 } else
4298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004300 else if (x == Py_None)
4301 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 else if (PyInt_Check(x)) {
4303 long value = PyInt_AS_LONG(x);
4304 if (value < 0 || value > 255) {
4305 PyErr_SetString(PyExc_TypeError,
4306 "character mapping must be in range(256)");
4307 Py_DECREF(x);
4308 return NULL;
4309 }
4310 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 else if (PyString_Check(x))
4313 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004316 PyErr_Format(PyExc_TypeError,
4317 "character mapping must return integer, None or str8, not %.400s",
4318 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_DECREF(x);
4320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 }
4322}
4323
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004324static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004325charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004326{
Guido van Rossum98297ee2007-11-06 21:34:58 +00004327 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004328 /* exponentially overallocate to minimize reallocations */
4329 if (requiredsize < 2*outsize)
4330 requiredsize = 2*outsize;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004331 if (_PyString_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004332 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004333 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004334}
4335
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004340 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 space is available. Return a new reference to the object that
4342 was put in the output buffer, or Py_None, if the mapping was undefined
4343 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004344 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004346charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004347 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004349 PyObject *rep;
4350 char *outstart;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004351 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004353 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004354 int res = encoding_map_lookup(c, mapping);
4355 Py_ssize_t requiredsize = *outpos+1;
4356 if (res == -1)
4357 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004358 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004359 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004360 return enc_EXCEPTION;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004361 outstart = PyString_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004362 outstart[(*outpos)++] = (char)res;
4363 return enc_SUCCESS;
4364 }
4365
4366 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004368 return enc_EXCEPTION;
4369 else if (rep==Py_None) {
4370 Py_DECREF(rep);
4371 return enc_FAILED;
4372 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004374 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004375 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004376 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004378 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004380 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4382 }
4383 else {
4384 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4386 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004387 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004388 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004390 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004392 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 memcpy(outstart + *outpos, repchars, repsize);
4394 *outpos += repsize;
4395 }
4396 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004397 Py_DECREF(rep);
4398 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399}
4400
4401/* handle an error in PyUnicode_EncodeCharmap
4402 Return 0 on success, -1 on error */
4403static
4404int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004405 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004407 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004408 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409{
4410 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004411 Py_ssize_t repsize;
4412 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 Py_UNICODE *uni2;
4414 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t collstartpos = *inpos;
4416 Py_ssize_t collendpos = *inpos+1;
4417 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 char *encoding = "charmap";
4419 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004420 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 /* find all unencodable characters */
4423 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004424 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004425 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004426 int res = encoding_map_lookup(p[collendpos], mapping);
4427 if (res != -1)
4428 break;
4429 ++collendpos;
4430 continue;
4431 }
4432
4433 rep = charmapencode_lookup(p[collendpos], mapping);
4434 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004436 else if (rep!=Py_None) {
4437 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 break;
4439 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004440 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 ++collendpos;
4442 }
4443 /* cache callback name lookup
4444 * (if not done yet, i.e. it's the first error) */
4445 if (*known_errorHandler==-1) {
4446 if ((errors==NULL) || (!strcmp(errors, "strict")))
4447 *known_errorHandler = 1;
4448 else if (!strcmp(errors, "replace"))
4449 *known_errorHandler = 2;
4450 else if (!strcmp(errors, "ignore"))
4451 *known_errorHandler = 3;
4452 else if (!strcmp(errors, "xmlcharrefreplace"))
4453 *known_errorHandler = 4;
4454 else
4455 *known_errorHandler = 0;
4456 }
4457 switch (*known_errorHandler) {
4458 case 1: /* strict */
4459 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4460 return -1;
4461 case 2: /* replace */
4462 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4463 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004464 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 return -1;
4466 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004467 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4469 return -1;
4470 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 }
4472 /* fall through */
4473 case 3: /* ignore */
4474 *inpos = collendpos;
4475 break;
4476 case 4: /* xmlcharrefreplace */
4477 /* generate replacement (temporarily (mis)uses p) */
4478 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4479 char buffer[2+29+1+1];
4480 char *cp;
4481 sprintf(buffer, "&#%d;", (int)p[collpos]);
4482 for (cp = buffer; *cp; ++cp) {
4483 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004484 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004486 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4488 return -1;
4489 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 }
4491 }
4492 *inpos = collendpos;
4493 break;
4494 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004495 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 encoding, reason, p, size, exceptionObject,
4497 collstartpos, collendpos, &newpos);
4498 if (repunicode == NULL)
4499 return -1;
4500 /* generate replacement */
4501 repsize = PyUnicode_GET_SIZE(repunicode);
4502 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4503 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004504 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 return -1;
4506 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004507 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4510 return -1;
4511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 }
4513 *inpos = newpos;
4514 Py_DECREF(repunicode);
4515 }
4516 return 0;
4517}
4518
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004520 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 PyObject *mapping,
4522 const char *errors)
4523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 /* output object */
4525 PyObject *res = NULL;
4526 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004527 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004529 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 PyObject *errorHandler = NULL;
4531 PyObject *exc = NULL;
4532 /* the following variable is used for caching string comparisons
4533 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4534 * 3=ignore, 4=xmlcharrefreplace */
4535 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536
4537 /* Default to Latin-1 */
4538 if (mapping == NULL)
4539 return PyUnicode_EncodeLatin1(p, size, errors);
4540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 /* allocate enough for a simple encoding without
4542 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004543 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 if (res == NULL)
4545 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004546 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 while (inpos<size) {
4550 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004551 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004552 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004554 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 if (charmap_encoding_error(p, size, &inpos, mapping,
4556 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004557 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004558 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004559 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 else
4563 /* done with this character => adjust input position */
4564 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 /* Resize if we allocated to much */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004568 if (respos<PyString_GET_SIZE(res))
4569 _PyString_Resize(&res, respos);
4570
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 Py_XDECREF(exc);
4572 Py_XDECREF(errorHandler);
4573 return res;
4574
4575 onError:
4576 Py_XDECREF(res);
4577 Py_XDECREF(exc);
4578 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 return NULL;
4580}
4581
4582PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4583 PyObject *mapping)
4584{
4585 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4586 PyErr_BadArgument();
4587 return NULL;
4588 }
4589 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4590 PyUnicode_GET_SIZE(unicode),
4591 mapping,
4592 NULL);
4593}
4594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595/* create or adjust a UnicodeTranslateError */
4596static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004597 const Py_UNICODE *unicode, Py_ssize_t size,
4598 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 if (*exceptionObject == NULL) {
4602 *exceptionObject = PyUnicodeTranslateError_Create(
4603 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 }
4605 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4607 goto onError;
4608 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4609 goto onError;
4610 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4611 goto onError;
4612 return;
4613 onError:
4614 Py_DECREF(*exceptionObject);
4615 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 }
4617}
4618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619/* raises a UnicodeTranslateError */
4620static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004621 const Py_UNICODE *unicode, Py_ssize_t size,
4622 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 const char *reason)
4624{
4625 make_translate_exception(exceptionObject,
4626 unicode, size, startpos, endpos, reason);
4627 if (*exceptionObject != NULL)
4628 PyCodec_StrictErrors(*exceptionObject);
4629}
4630
4631/* error handling callback helper:
4632 build arguments, call the callback and check the arguments,
4633 put the result into newpos and return the replacement string, which
4634 has to be freed by the caller */
4635static PyObject *unicode_translate_call_errorhandler(const char *errors,
4636 PyObject **errorHandler,
4637 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4639 Py_ssize_t startpos, Py_ssize_t endpos,
4640 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004642 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004644 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 PyObject *restuple;
4646 PyObject *resunicode;
4647
4648 if (*errorHandler == NULL) {
4649 *errorHandler = PyCodec_LookupError(errors);
4650 if (*errorHandler == NULL)
4651 return NULL;
4652 }
4653
4654 make_translate_exception(exceptionObject,
4655 unicode, size, startpos, endpos, reason);
4656 if (*exceptionObject == NULL)
4657 return NULL;
4658
4659 restuple = PyObject_CallFunctionObjArgs(
4660 *errorHandler, *exceptionObject, NULL);
4661 if (restuple == NULL)
4662 return NULL;
4663 if (!PyTuple_Check(restuple)) {
4664 PyErr_Format(PyExc_TypeError, &argparse[4]);
4665 Py_DECREF(restuple);
4666 return NULL;
4667 }
4668 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004669 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 Py_DECREF(restuple);
4671 return NULL;
4672 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004673 if (i_newpos<0)
4674 *newpos = size+i_newpos;
4675 else
4676 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004677 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004678 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004679 Py_DECREF(restuple);
4680 return NULL;
4681 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 Py_INCREF(resunicode);
4683 Py_DECREF(restuple);
4684 return resunicode;
4685}
4686
4687/* Lookup the character ch in the mapping and put the result in result,
4688 which must be decrefed by the caller.
4689 Return 0 on success, -1 on error */
4690static
4691int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4692{
4693 PyObject *w = PyInt_FromLong((long)c);
4694 PyObject *x;
4695
4696 if (w == NULL)
4697 return -1;
4698 x = PyObject_GetItem(mapping, w);
4699 Py_DECREF(w);
4700 if (x == NULL) {
4701 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4702 /* No mapping found means: use 1:1 mapping. */
4703 PyErr_Clear();
4704 *result = NULL;
4705 return 0;
4706 } else
4707 return -1;
4708 }
4709 else if (x == Py_None) {
4710 *result = x;
4711 return 0;
4712 }
4713 else if (PyInt_Check(x)) {
4714 long value = PyInt_AS_LONG(x);
4715 long max = PyUnicode_GetMax();
4716 if (value < 0 || value > max) {
4717 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00004718 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 Py_DECREF(x);
4720 return -1;
4721 }
4722 *result = x;
4723 return 0;
4724 }
4725 else if (PyUnicode_Check(x)) {
4726 *result = x;
4727 return 0;
4728 }
4729 else {
4730 /* wrong return value */
4731 PyErr_SetString(PyExc_TypeError,
4732 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004733 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 return -1;
4735 }
4736}
4737/* ensure that *outobj is at least requiredsize characters long,
4738if not reallocate and adjust various state variables.
4739Return 0 on success, -1 on error */
4740static
Walter Dörwald4894c302003-10-24 14:25:28 +00004741int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004742 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004744 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004745 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004747 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004749 if (requiredsize < 2 * oldsize)
4750 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004751 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 return -1;
4753 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 }
4755 return 0;
4756}
4757/* lookup the character, put the result in the output string and adjust
4758 various state variables. Return a new reference to the object that
4759 was put in the output buffer in *result, or Py_None, if the mapping was
4760 undefined (in which case no character was written).
4761 The called must decref result.
4762 Return 0 on success, -1 on error. */
4763static
Walter Dörwald4894c302003-10-24 14:25:28 +00004764int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004765 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004766 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767{
Walter Dörwald4894c302003-10-24 14:25:28 +00004768 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 return -1;
4770 if (*res==NULL) {
4771 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004772 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 }
4774 else if (*res==Py_None)
4775 ;
4776 else if (PyInt_Check(*res)) {
4777 /* no overflow check, because we know that the space is enough */
4778 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4779 }
4780 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 if (repsize==1) {
4783 /* no overflow check, because we know that the space is enough */
4784 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4785 }
4786 else if (repsize!=0) {
4787 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004789 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004790 repsize - 1;
4791 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 return -1;
4793 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4794 *outp += repsize;
4795 }
4796 }
4797 else
4798 return -1;
4799 return 0;
4800}
4801
4802PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004803 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 PyObject *mapping,
4805 const char *errors)
4806{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 /* output object */
4808 PyObject *res = NULL;
4809 /* pointers to the beginning and end+1 of input */
4810 const Py_UNICODE *startp = p;
4811 const Py_UNICODE *endp = p + size;
4812 /* pointer into the output */
4813 Py_UNICODE *str;
4814 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 char *reason = "character maps to <undefined>";
4817 PyObject *errorHandler = NULL;
4818 PyObject *exc = NULL;
4819 /* the following variable is used for caching string comparisons
4820 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4821 * 3=ignore, 4=xmlcharrefreplace */
4822 int known_errorHandler = -1;
4823
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 if (mapping == NULL) {
4825 PyErr_BadArgument();
4826 return NULL;
4827 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828
4829 /* allocate enough for a simple 1:1 translation without
4830 replacements, if we need more, we'll resize */
4831 res = PyUnicode_FromUnicode(NULL, size);
4832 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004833 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 return res;
4836 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004838 while (p<endp) {
4839 /* try to encode it */
4840 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004841 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 goto onError;
4844 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004845 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 if (x!=Py_None) /* it worked => adjust input pointer */
4847 ++p;
4848 else { /* untranslatable character */
4849 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004850 Py_ssize_t repsize;
4851 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852 Py_UNICODE *uni2;
4853 /* startpos for collecting untranslatable chars */
4854 const Py_UNICODE *collstart = p;
4855 const Py_UNICODE *collend = p+1;
4856 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 /* find all untranslatable characters */
4859 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004860 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 goto onError;
4862 Py_XDECREF(x);
4863 if (x!=Py_None)
4864 break;
4865 ++collend;
4866 }
4867 /* cache callback name lookup
4868 * (if not done yet, i.e. it's the first error) */
4869 if (known_errorHandler==-1) {
4870 if ((errors==NULL) || (!strcmp(errors, "strict")))
4871 known_errorHandler = 1;
4872 else if (!strcmp(errors, "replace"))
4873 known_errorHandler = 2;
4874 else if (!strcmp(errors, "ignore"))
4875 known_errorHandler = 3;
4876 else if (!strcmp(errors, "xmlcharrefreplace"))
4877 known_errorHandler = 4;
4878 else
4879 known_errorHandler = 0;
4880 }
4881 switch (known_errorHandler) {
4882 case 1: /* strict */
4883 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4884 goto onError;
4885 case 2: /* replace */
4886 /* No need to check for space, this is a 1:1 replacement */
4887 for (coll = collstart; coll<collend; ++coll)
4888 *str++ = '?';
4889 /* fall through */
4890 case 3: /* ignore */
4891 p = collend;
4892 break;
4893 case 4: /* xmlcharrefreplace */
4894 /* generate replacement (temporarily (mis)uses p) */
4895 for (p = collstart; p < collend; ++p) {
4896 char buffer[2+29+1+1];
4897 char *cp;
4898 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004899 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4901 goto onError;
4902 for (cp = buffer; *cp; ++cp)
4903 *str++ = *cp;
4904 }
4905 p = collend;
4906 break;
4907 default:
4908 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4909 reason, startp, size, &exc,
4910 collstart-startp, collend-startp, &newpos);
4911 if (repunicode == NULL)
4912 goto onError;
4913 /* generate replacement */
4914 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004915 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4917 Py_DECREF(repunicode);
4918 goto onError;
4919 }
4920 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4921 *str++ = *uni2;
4922 p = startp + newpos;
4923 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 }
4925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 /* Resize if we allocated to much */
4928 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004929 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004930 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004931 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 }
4933 Py_XDECREF(exc);
4934 Py_XDECREF(errorHandler);
4935 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 onError:
4938 Py_XDECREF(res);
4939 Py_XDECREF(exc);
4940 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 return NULL;
4942}
4943
4944PyObject *PyUnicode_Translate(PyObject *str,
4945 PyObject *mapping,
4946 const char *errors)
4947{
4948 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004949
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 str = PyUnicode_FromObject(str);
4951 if (str == NULL)
4952 goto onError;
4953 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4954 PyUnicode_GET_SIZE(str),
4955 mapping,
4956 errors);
4957 Py_DECREF(str);
4958 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004959
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 onError:
4961 Py_XDECREF(str);
4962 return NULL;
4963}
Tim Petersced69f82003-09-16 20:30:58 +00004964
Guido van Rossum9e896b32000-04-05 20:11:21 +00004965/* --- Decimal Encoder ---------------------------------------------------- */
4966
4967int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004968 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004969 char *output,
4970 const char *errors)
4971{
4972 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004973 PyObject *errorHandler = NULL;
4974 PyObject *exc = NULL;
4975 const char *encoding = "decimal";
4976 const char *reason = "invalid decimal Unicode string";
4977 /* the following variable is used for caching string comparisons
4978 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4979 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004980
4981 if (output == NULL) {
4982 PyErr_BadArgument();
4983 return -1;
4984 }
4985
4986 p = s;
4987 end = s + length;
4988 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004990 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004992 Py_ssize_t repsize;
4993 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 Py_UNICODE *uni2;
4995 Py_UNICODE *collstart;
4996 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004997
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 if (Py_UNICODE_ISSPACE(ch)) {
4999 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005001 continue;
5002 }
5003 decimal = Py_UNICODE_TODECIMAL(ch);
5004 if (decimal >= 0) {
5005 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005007 continue;
5008 }
Guido van Rossumba477042000-04-06 18:18:10 +00005009 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005010 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005011 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005012 continue;
5013 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 /* All other characters are considered unencodable */
5015 collstart = p;
5016 collend = p+1;
5017 while (collend < end) {
5018 if ((0 < *collend && *collend < 256) ||
5019 !Py_UNICODE_ISSPACE(*collend) ||
5020 Py_UNICODE_TODECIMAL(*collend))
5021 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005022 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 /* cache callback name lookup
5024 * (if not done yet, i.e. it's the first error) */
5025 if (known_errorHandler==-1) {
5026 if ((errors==NULL) || (!strcmp(errors, "strict")))
5027 known_errorHandler = 1;
5028 else if (!strcmp(errors, "replace"))
5029 known_errorHandler = 2;
5030 else if (!strcmp(errors, "ignore"))
5031 known_errorHandler = 3;
5032 else if (!strcmp(errors, "xmlcharrefreplace"))
5033 known_errorHandler = 4;
5034 else
5035 known_errorHandler = 0;
5036 }
5037 switch (known_errorHandler) {
5038 case 1: /* strict */
5039 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5040 goto onError;
5041 case 2: /* replace */
5042 for (p = collstart; p < collend; ++p)
5043 *output++ = '?';
5044 /* fall through */
5045 case 3: /* ignore */
5046 p = collend;
5047 break;
5048 case 4: /* xmlcharrefreplace */
5049 /* generate replacement (temporarily (mis)uses p) */
5050 for (p = collstart; p < collend; ++p)
5051 output += sprintf(output, "&#%d;", (int)*p);
5052 p = collend;
5053 break;
5054 default:
5055 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5056 encoding, reason, s, length, &exc,
5057 collstart-s, collend-s, &newpos);
5058 if (repunicode == NULL)
5059 goto onError;
5060 /* generate replacement */
5061 repsize = PyUnicode_GET_SIZE(repunicode);
5062 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5063 Py_UNICODE ch = *uni2;
5064 if (Py_UNICODE_ISSPACE(ch))
5065 *output++ = ' ';
5066 else {
5067 decimal = Py_UNICODE_TODECIMAL(ch);
5068 if (decimal >= 0)
5069 *output++ = '0' + decimal;
5070 else if (0 < ch && ch < 256)
5071 *output++ = (char)ch;
5072 else {
5073 Py_DECREF(repunicode);
5074 raise_encode_exception(&exc, encoding,
5075 s, length, collstart-s, collend-s, reason);
5076 goto onError;
5077 }
5078 }
5079 }
5080 p = s + newpos;
5081 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005082 }
5083 }
5084 /* 0-terminate the output string */
5085 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 Py_XDECREF(exc);
5087 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005088 return 0;
5089
5090 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 Py_XDECREF(exc);
5092 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005093 return -1;
5094}
5095
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096/* --- Helpers ------------------------------------------------------------ */
5097
Eric Smith8c663262007-08-25 02:26:07 +00005098#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005099#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005100#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005101/* Include _ParseTupleFinds from find.h */
5102#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005103#include "stringlib/find.h"
5104#include "stringlib/partition.h"
5105
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the invoking scope and
   uses (obj)->length as the sequence length: negative values are taken
   relative to the length and then clamped to 0, and `end` is capped at
   the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5118
Martin v. Löwis18e16552006-02-15 17:27:45 +00005119Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005120 PyObject *substr,
5121 Py_ssize_t start,
5122 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005124 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005125 PyUnicodeObject* str_obj;
5126 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005127
Thomas Wouters477c8d52006-05-27 19:21:47 +00005128 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5129 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005131 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5132 if (!sub_obj) {
5133 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 return -1;
5135 }
Tim Petersced69f82003-09-16 20:30:58 +00005136
Thomas Wouters477c8d52006-05-27 19:21:47 +00005137 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005138
Thomas Wouters477c8d52006-05-27 19:21:47 +00005139 result = stringlib_count(
5140 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5141 );
5142
5143 Py_DECREF(sub_obj);
5144 Py_DECREF(str_obj);
5145
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 return result;
5147}
5148
Martin v. Löwis18e16552006-02-15 17:27:45 +00005149Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005150 PyObject *sub,
5151 Py_ssize_t start,
5152 Py_ssize_t end,
5153 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005155 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005156
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005158 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005159 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005160 sub = PyUnicode_FromObject(sub);
5161 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005162 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005163 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 }
Tim Petersced69f82003-09-16 20:30:58 +00005165
Thomas Wouters477c8d52006-05-27 19:21:47 +00005166 if (direction > 0)
5167 result = stringlib_find_slice(
5168 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5169 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5170 start, end
5171 );
5172 else
5173 result = stringlib_rfind_slice(
5174 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5175 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5176 start, end
5177 );
5178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005180 Py_DECREF(sub);
5181
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 return result;
5183}
5184
Tim Petersced69f82003-09-16 20:30:58 +00005185static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186int tailmatch(PyUnicodeObject *self,
5187 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005188 Py_ssize_t start,
5189 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 int direction)
5191{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 if (substring->length == 0)
5193 return 1;
5194
Thomas Wouters477c8d52006-05-27 19:21:47 +00005195 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
5197 end -= substring->length;
5198 if (end < start)
5199 return 0;
5200
5201 if (direction > 0) {
5202 if (Py_UNICODE_MATCH(self, end, substring))
5203 return 1;
5204 } else {
5205 if (Py_UNICODE_MATCH(self, start, substring))
5206 return 1;
5207 }
5208
5209 return 0;
5210}
5211
Martin v. Löwis18e16552006-02-15 17:27:45 +00005212Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005214 Py_ssize_t start,
5215 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 int direction)
5217{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005218 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 str = PyUnicode_FromObject(str);
5221 if (str == NULL)
5222 return -1;
5223 substr = PyUnicode_FromObject(substr);
5224 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005225 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 return -1;
5227 }
Tim Petersced69f82003-09-16 20:30:58 +00005228
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 result = tailmatch((PyUnicodeObject *)str,
5230 (PyUnicodeObject *)substr,
5231 start, end, direction);
5232 Py_DECREF(str);
5233 Py_DECREF(substr);
5234 return result;
5235}
5236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237/* Apply fixfct filter to the Unicode object self and return a
5238 reference to the modified object */
5239
Tim Petersced69f82003-09-16 20:30:58 +00005240static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241PyObject *fixup(PyUnicodeObject *self,
5242 int (*fixfct)(PyUnicodeObject *s))
5243{
5244
5245 PyUnicodeObject *u;
5246
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005247 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 if (u == NULL)
5249 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005250
5251 Py_UNICODE_COPY(u->str, self->str, self->length);
5252
Tim Peters7a29bd52001-09-12 03:03:31 +00005253 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 /* fixfct should return TRUE if it modified the buffer. If
5255 FALSE, return a reference to the original buffer instead
5256 (to save space, not time) */
5257 Py_INCREF(self);
5258 Py_DECREF(u);
5259 return (PyObject*) self;
5260 }
5261 return (PyObject*) u;
5262}
5263
Tim Petersced69f82003-09-16 20:30:58 +00005264static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265int fixupper(PyUnicodeObject *self)
5266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 Py_UNICODE *s = self->str;
5269 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 while (len-- > 0) {
5272 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 ch = Py_UNICODE_TOUPPER(*s);
5275 if (ch != *s) {
5276 status = 1;
5277 *s = ch;
5278 }
5279 s++;
5280 }
5281
5282 return status;
5283}
5284
Tim Petersced69f82003-09-16 20:30:58 +00005285static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286int fixlower(PyUnicodeObject *self)
5287{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005288 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 Py_UNICODE *s = self->str;
5290 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 while (len-- > 0) {
5293 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005294
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 ch = Py_UNICODE_TOLOWER(*s);
5296 if (ch != *s) {
5297 status = 1;
5298 *s = ch;
5299 }
5300 s++;
5301 }
5302
5303 return status;
5304}
5305
Tim Petersced69f82003-09-16 20:30:58 +00005306static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307int fixswapcase(PyUnicodeObject *self)
5308{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005309 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 Py_UNICODE *s = self->str;
5311 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 while (len-- > 0) {
5314 if (Py_UNICODE_ISUPPER(*s)) {
5315 *s = Py_UNICODE_TOLOWER(*s);
5316 status = 1;
5317 } else if (Py_UNICODE_ISLOWER(*s)) {
5318 *s = Py_UNICODE_TOUPPER(*s);
5319 status = 1;
5320 }
5321 s++;
5322 }
5323
5324 return status;
5325}
5326
Tim Petersced69f82003-09-16 20:30:58 +00005327static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328int fixcapitalize(PyUnicodeObject *self)
5329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005331 Py_UNICODE *s = self->str;
5332 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005333
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005334 if (len == 0)
5335 return 0;
5336 if (Py_UNICODE_ISLOWER(*s)) {
5337 *s = Py_UNICODE_TOUPPER(*s);
5338 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005340 s++;
5341 while (--len > 0) {
5342 if (Py_UNICODE_ISUPPER(*s)) {
5343 *s = Py_UNICODE_TOLOWER(*s);
5344 status = 1;
5345 }
5346 s++;
5347 }
5348 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349}
5350
5351static
5352int fixtitle(PyUnicodeObject *self)
5353{
5354 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5355 register Py_UNICODE *e;
5356 int previous_is_cased;
5357
5358 /* Shortcut for single character strings */
5359 if (PyUnicode_GET_SIZE(self) == 1) {
5360 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5361 if (*p != ch) {
5362 *p = ch;
5363 return 1;
5364 }
5365 else
5366 return 0;
5367 }
Tim Petersced69f82003-09-16 20:30:58 +00005368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 e = p + PyUnicode_GET_SIZE(self);
5370 previous_is_cased = 0;
5371 for (; p < e; p++) {
5372 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 if (previous_is_cased)
5375 *p = Py_UNICODE_TOLOWER(ch);
5376 else
5377 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005378
5379 if (Py_UNICODE_ISLOWER(ch) ||
5380 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 Py_UNICODE_ISTITLE(ch))
5382 previous_is_cased = 1;
5383 else
5384 previous_is_cased = 0;
5385 }
5386 return 1;
5387}
5388
Tim Peters8ce9f162004-08-27 01:49:32 +00005389PyObject *
5390PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391{
Tim Peters8ce9f162004-08-27 01:49:32 +00005392 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005393 const Py_UNICODE blank = ' ';
5394 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005395 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005396 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005397 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5398 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005399 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5400 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005402 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005403 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Tim Peters05eba1f2004-08-27 21:32:02 +00005405 fseq = PySequence_Fast(seq, "");
5406 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005407 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005408 }
5409
Tim Peters91879ab2004-08-27 22:35:44 +00005410 /* Grrrr. A codec may be invoked to convert str objects to
5411 * Unicode, and so it's possible to call back into Python code
5412 * during PyUnicode_FromObject(), and so it's possible for a sick
5413 * codec to change the size of fseq (if seq is a list). Therefore
5414 * we have to keep refetching the size -- can't assume seqlen
5415 * is invariant.
5416 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005417 seqlen = PySequence_Fast_GET_SIZE(fseq);
5418 /* If empty sequence, return u"". */
5419 if (seqlen == 0) {
5420 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5421 goto Done;
5422 }
5423 /* If singleton sequence with an exact Unicode, return that. */
5424 if (seqlen == 1) {
5425 item = PySequence_Fast_GET_ITEM(fseq, 0);
5426 if (PyUnicode_CheckExact(item)) {
5427 Py_INCREF(item);
5428 res = (PyUnicodeObject *)item;
5429 goto Done;
5430 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005431 }
5432
Tim Peters05eba1f2004-08-27 21:32:02 +00005433 /* At least two items to join, or one that isn't exact Unicode. */
5434 if (seqlen > 1) {
5435 /* Set up sep and seplen -- they're needed. */
5436 if (separator == NULL) {
5437 sep = &blank;
5438 seplen = 1;
5439 }
5440 else {
5441 internal_separator = PyUnicode_FromObject(separator);
5442 if (internal_separator == NULL)
5443 goto onError;
5444 sep = PyUnicode_AS_UNICODE(internal_separator);
5445 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005446 /* In case PyUnicode_FromObject() mutated seq. */
5447 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005448 }
5449 }
5450
5451 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005452 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005453 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005454 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005455 res_p = PyUnicode_AS_UNICODE(res);
5456 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005457
Tim Peters05eba1f2004-08-27 21:32:02 +00005458 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005459 Py_ssize_t itemlen;
5460 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005461
5462 item = PySequence_Fast_GET_ITEM(fseq, i);
5463 /* Convert item to Unicode. */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005464 if (!PyUnicode_Check(item)) {
5465 PyErr_Format(PyExc_TypeError,
5466 "sequence item %zd: expected str instance,"
5467 " %.80s found",
5468 i, Py_Type(item)->tp_name);
5469 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005470 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005471 item = PyUnicode_FromObject(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005472 if (item == NULL)
5473 goto onError;
5474 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005475
Tim Peters91879ab2004-08-27 22:35:44 +00005476 /* In case PyUnicode_FromObject() mutated seq. */
5477 seqlen = PySequence_Fast_GET_SIZE(fseq);
5478
Tim Peters8ce9f162004-08-27 01:49:32 +00005479 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005481 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005482 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005483 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005484 if (i < seqlen - 1) {
5485 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005486 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005487 goto Overflow;
5488 }
5489 if (new_res_used > res_alloc) {
5490 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005491 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005492 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005493 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005494 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005495 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005496 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005497 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005499 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005500 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005502
5503 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005504 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005505 res_p += itemlen;
5506 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005507 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 res_p += seplen;
5509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005511 res_used = new_res_used;
5512 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005513
Tim Peters05eba1f2004-08-27 21:32:02 +00005514 /* Shrink res to match the used area; this probably can't fail,
5515 * but it's cheap to check.
5516 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005517 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005518 goto onError;
5519
5520 Done:
5521 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005522 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 return (PyObject *)res;
5524
Tim Peters8ce9f162004-08-27 01:49:32 +00005525 Overflow:
5526 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005527 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005528 Py_DECREF(item);
5529 /* fall through */
5530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005532 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005533 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005534 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 return NULL;
5536}
5537
Tim Petersced69f82003-09-16 20:30:58 +00005538static
5539PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005540 Py_ssize_t left,
5541 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 Py_UNICODE fill)
5543{
5544 PyUnicodeObject *u;
5545
5546 if (left < 0)
5547 left = 0;
5548 if (right < 0)
5549 right = 0;
5550
Tim Peters7a29bd52001-09-12 03:03:31 +00005551 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 Py_INCREF(self);
5553 return self;
5554 }
5555
5556 u = _PyUnicode_New(left + self->length + right);
5557 if (u) {
5558 if (left)
5559 Py_UNICODE_FILL(u->str, fill, left);
5560 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5561 if (right)
5562 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5563 }
5564
5565 return u;
5566}
5567
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on names from the expansion site: a `PyObject *str` scratch
   variable, the target `list`, and an `onError` label that is jumped to
   when either the slice cannot be created or the append fails.

   Wrapped in do { } while (0) so the expansion is a single statement and
   cannot capture or be captured by a surrounding if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5578
5579static
5580PyObject *split_whitespace(PyUnicodeObject *self,
5581 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005582 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005584 register Py_ssize_t i;
5585 register Py_ssize_t j;
5586 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 PyObject *str;
5588
5589 for (i = j = 0; i < len; ) {
5590 /* find a token */
5591 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5592 i++;
5593 j = i;
5594 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5595 i++;
5596 if (j < i) {
5597 if (maxcount-- <= 0)
5598 break;
5599 SPLIT_APPEND(self->str, j, i);
5600 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5601 i++;
5602 j = i;
5603 }
5604 }
5605 if (j < len) {
5606 SPLIT_APPEND(self->str, j, len);
5607 }
5608 return list;
5609
5610 onError:
5611 Py_DECREF(list);
5612 return NULL;
5613}
5614
5615PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005616 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005618 register Py_ssize_t i;
5619 register Py_ssize_t j;
5620 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 PyObject *list;
5622 PyObject *str;
5623 Py_UNICODE *data;
5624
5625 string = PyUnicode_FromObject(string);
5626 if (string == NULL)
5627 return NULL;
5628 data = PyUnicode_AS_UNICODE(string);
5629 len = PyUnicode_GET_SIZE(string);
5630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 list = PyList_New(0);
5632 if (!list)
5633 goto onError;
5634
5635 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005637
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005639 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
5642 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005643 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 if (i < len) {
5645 if (data[i] == '\r' && i + 1 < len &&
5646 data[i+1] == '\n')
5647 i += 2;
5648 else
5649 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005650 if (keepends)
5651 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 }
Guido van Rossum86662912000-04-11 15:38:46 +00005653 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 j = i;
5655 }
5656 if (j < len) {
5657 SPLIT_APPEND(data, j, len);
5658 }
5659
5660 Py_DECREF(string);
5661 return list;
5662
5663 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005664 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 Py_DECREF(string);
5666 return NULL;
5667}
5668
Tim Petersced69f82003-09-16 20:30:58 +00005669static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670PyObject *split_char(PyUnicodeObject *self,
5671 PyObject *list,
5672 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005673 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005675 register Py_ssize_t i;
5676 register Py_ssize_t j;
5677 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 PyObject *str;
5679
5680 for (i = j = 0; i < len; ) {
5681 if (self->str[i] == ch) {
5682 if (maxcount-- <= 0)
5683 break;
5684 SPLIT_APPEND(self->str, j, i);
5685 i = j = i + 1;
5686 } else
5687 i++;
5688 }
5689 if (j <= len) {
5690 SPLIT_APPEND(self->str, j, len);
5691 }
5692 return list;
5693
5694 onError:
5695 Py_DECREF(list);
5696 return NULL;
5697}
5698
Tim Petersced69f82003-09-16 20:30:58 +00005699static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700PyObject *split_substring(PyUnicodeObject *self,
5701 PyObject *list,
5702 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005705 register Py_ssize_t i;
5706 register Py_ssize_t j;
5707 Py_ssize_t len = self->length;
5708 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 PyObject *str;
5710
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005711 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 if (Py_UNICODE_MATCH(self, i, substring)) {
5713 if (maxcount-- <= 0)
5714 break;
5715 SPLIT_APPEND(self->str, j, i);
5716 i = j = i + sublen;
5717 } else
5718 i++;
5719 }
5720 if (j <= len) {
5721 SPLIT_APPEND(self->str, j, len);
5722 }
5723 return list;
5724
5725 onError:
5726 Py_DECREF(list);
5727 return NULL;
5728}
5729
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005730static
5731PyObject *rsplit_whitespace(PyUnicodeObject *self,
5732 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005733 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005734{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005735 register Py_ssize_t i;
5736 register Py_ssize_t j;
5737 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005738 PyObject *str;
5739
5740 for (i = j = len - 1; i >= 0; ) {
5741 /* find a token */
5742 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5743 i--;
5744 j = i;
5745 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5746 i--;
5747 if (j > i) {
5748 if (maxcount-- <= 0)
5749 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005750 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005751 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5752 i--;
5753 j = i;
5754 }
5755 }
5756 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005757 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005758 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005759 if (PyList_Reverse(list) < 0)
5760 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005761 return list;
5762
5763 onError:
5764 Py_DECREF(list);
5765 return NULL;
5766}
5767
5768static
5769PyObject *rsplit_char(PyUnicodeObject *self,
5770 PyObject *list,
5771 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005772 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005773{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005774 register Py_ssize_t i;
5775 register Py_ssize_t j;
5776 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777 PyObject *str;
5778
5779 for (i = j = len - 1; i >= 0; ) {
5780 if (self->str[i] == ch) {
5781 if (maxcount-- <= 0)
5782 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005783 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005784 j = i = i - 1;
5785 } else
5786 i--;
5787 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005788 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005789 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005791 if (PyList_Reverse(list) < 0)
5792 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005793 return list;
5794
5795 onError:
5796 Py_DECREF(list);
5797 return NULL;
5798}
5799
5800static
5801PyObject *rsplit_substring(PyUnicodeObject *self,
5802 PyObject *list,
5803 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005804 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005805{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005806 register Py_ssize_t i;
5807 register Py_ssize_t j;
5808 Py_ssize_t len = self->length;
5809 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005810 PyObject *str;
5811
5812 for (i = len - sublen, j = len; i >= 0; ) {
5813 if (Py_UNICODE_MATCH(self, i, substring)) {
5814 if (maxcount-- <= 0)
5815 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005816 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817 j = i;
5818 i -= sublen;
5819 } else
5820 i--;
5821 }
5822 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005823 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005824 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005825 if (PyList_Reverse(list) < 0)
5826 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005827 return list;
5828
5829 onError:
5830 Py_DECREF(list);
5831 return NULL;
5832}
5833
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834#undef SPLIT_APPEND
5835
5836static
5837PyObject *split(PyUnicodeObject *self,
5838 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005839 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840{
5841 PyObject *list;
5842
5843 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005844 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845
5846 list = PyList_New(0);
5847 if (!list)
5848 return NULL;
5849
5850 if (substring == NULL)
5851 return split_whitespace(self,list,maxcount);
5852
5853 else if (substring->length == 1)
5854 return split_char(self,list,substring->str[0],maxcount);
5855
5856 else if (substring->length == 0) {
5857 Py_DECREF(list);
5858 PyErr_SetString(PyExc_ValueError, "empty separator");
5859 return NULL;
5860 }
5861 else
5862 return split_substring(self,list,substring,maxcount);
5863}
5864
Tim Petersced69f82003-09-16 20:30:58 +00005865static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005866PyObject *rsplit(PyUnicodeObject *self,
5867 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005869{
5870 PyObject *list;
5871
5872 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005873 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005874
5875 list = PyList_New(0);
5876 if (!list)
5877 return NULL;
5878
5879 if (substring == NULL)
5880 return rsplit_whitespace(self,list,maxcount);
5881
5882 else if (substring->length == 1)
5883 return rsplit_char(self,list,substring->str[0],maxcount);
5884
5885 else if (substring->length == 0) {
5886 Py_DECREF(list);
5887 PyErr_SetString(PyExc_ValueError, "empty separator");
5888 return NULL;
5889 }
5890 else
5891 return rsplit_substring(self,list,substring,maxcount);
5892}
5893
5894static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895PyObject *replace(PyUnicodeObject *self,
5896 PyUnicodeObject *str1,
5897 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005898 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899{
5900 PyUnicodeObject *u;
5901
5902 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005903 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904
Thomas Wouters477c8d52006-05-27 19:21:47 +00005905 if (str1->length == str2->length) {
5906 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005907 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005908 if (str1->length == 1) {
5909 /* replace characters */
5910 Py_UNICODE u1, u2;
5911 if (!findchar(self->str, self->length, str1->str[0]))
5912 goto nothing;
5913 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5914 if (!u)
5915 return NULL;
5916 Py_UNICODE_COPY(u->str, self->str, self->length);
5917 u1 = str1->str[0];
5918 u2 = str2->str[0];
5919 for (i = 0; i < u->length; i++)
5920 if (u->str[i] == u1) {
5921 if (--maxcount < 0)
5922 break;
5923 u->str[i] = u2;
5924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005926 i = fastsearch(
5927 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005929 if (i < 0)
5930 goto nothing;
5931 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5932 if (!u)
5933 return NULL;
5934 Py_UNICODE_COPY(u->str, self->str, self->length);
5935 while (i <= self->length - str1->length)
5936 if (Py_UNICODE_MATCH(self, i, str1)) {
5937 if (--maxcount < 0)
5938 break;
5939 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5940 i += str1->length;
5941 } else
5942 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005945
5946 Py_ssize_t n, i, j, e;
5947 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 Py_UNICODE *p;
5949
5950 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005951 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 if (n > maxcount)
5953 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954 if (n == 0)
5955 goto nothing;
5956 /* new_size = self->length + n * (str2->length - str1->length)); */
5957 delta = (str2->length - str1->length);
5958 if (delta == 0) {
5959 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005961 product = n * (str2->length - str1->length);
5962 if ((product / (str2->length - str1->length)) != n) {
5963 PyErr_SetString(PyExc_OverflowError,
5964 "replace string is too long");
5965 return NULL;
5966 }
5967 new_size = self->length + product;
5968 if (new_size < 0) {
5969 PyErr_SetString(PyExc_OverflowError,
5970 "replace string is too long");
5971 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 }
5973 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005974 u = _PyUnicode_New(new_size);
5975 if (!u)
5976 return NULL;
5977 i = 0;
5978 p = u->str;
5979 e = self->length - str1->length;
5980 if (str1->length > 0) {
5981 while (n-- > 0) {
5982 /* look for next match */
5983 j = i;
5984 while (j <= e) {
5985 if (Py_UNICODE_MATCH(self, j, str1))
5986 break;
5987 j++;
5988 }
5989 if (j > i) {
5990 if (j > e)
5991 break;
5992 /* copy unchanged part [i:j] */
5993 Py_UNICODE_COPY(p, self->str+i, j-i);
5994 p += j - i;
5995 }
5996 /* copy substitution string */
5997 if (str2->length > 0) {
5998 Py_UNICODE_COPY(p, str2->str, str2->length);
5999 p += str2->length;
6000 }
6001 i = j + str1->length;
6002 }
6003 if (i < self->length)
6004 /* copy tail [i:] */
6005 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6006 } else {
6007 /* interleave */
6008 while (n > 0) {
6009 Py_UNICODE_COPY(p, str2->str, str2->length);
6010 p += str2->length;
6011 if (--n <= 0)
6012 break;
6013 *p++ = self->str[i++];
6014 }
6015 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006019
6020nothing:
6021 /* nothing to replace; return original string (when possible) */
6022 if (PyUnicode_CheckExact(self)) {
6023 Py_INCREF(self);
6024 return (PyObject *) self;
6025 }
6026 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027}
6028
6029/* --- Unicode Object Methods --------------------------------------------- */
6030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006031PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032"S.title() -> unicode\n\
6033\n\
6034Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006035characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036
6037static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006038unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 return fixup(self, fixtitle);
6041}
6042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006043PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044"S.capitalize() -> unicode\n\
6045\n\
6046Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006047have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
6049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006050unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 return fixup(self, fixcapitalize);
6053}
6054
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word and join the
   words again with single blanks.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6092
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006093/* Argument converter. Coerces to a single unicode character */
6094
6095static int
6096convert_uc(PyObject *obj, void *addr)
6097{
6098 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6099 PyObject *uniobj;
6100 Py_UNICODE *unistr;
6101
6102 uniobj = PyUnicode_FromObject(obj);
6103 if (uniobj == NULL) {
6104 PyErr_SetString(PyExc_TypeError,
6105 "The fill character cannot be converted to Unicode");
6106 return 0;
6107 }
6108 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6109 PyErr_SetString(PyExc_TypeError,
6110 "The fill character must be exactly one character long");
6111 Py_DECREF(uniobj);
6112 return 0;
6113 }
6114 unistr = PyUnicode_AS_UNICODE(uniobj);
6115 *fillcharloc = unistr[0];
6116 Py_DECREF(uniobj);
6117 return 1;
6118}
6119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006120PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006121"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006123Return S centered in a Unicode string of length width. Padding is\n\
6124done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125
6126static PyObject *
6127unicode_center(PyUnicodeObject *self, PyObject *args)
6128{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t marg, left;
6130 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006131 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
Thomas Woutersde017742006-02-16 19:34:37 +00006133 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 return NULL;
6135
Tim Peters7a29bd52001-09-12 03:03:31 +00006136 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 Py_INCREF(self);
6138 return (PyObject*) self;
6139 }
6140
6141 marg = width - self->length;
6142 left = marg / 2 + (marg & width & 1);
6143
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006144 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145}
6146
Marc-André Lemburge5034372000-08-08 08:04:29 +00006147#if 0
6148
6149/* This code should go into some future Unicode collation support
6150 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006151 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006152
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006153/* speedy UTF-16 code point order comparison */
6154/* gleaned from: */
6155/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6156
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006157static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006158{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006159 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006160 0, 0, 0, 0, 0, 0, 0, 0,
6161 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006162 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006163};
6164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165static int
6166unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006169
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 Py_UNICODE *s1 = str1->str;
6171 Py_UNICODE *s2 = str2->str;
6172
6173 len1 = str1->length;
6174 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006175
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006177 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006178
6179 c1 = *s1++;
6180 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006181
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006182 if (c1 > (1<<11) * 26)
6183 c1 += utf16Fixup[c1>>11];
6184 if (c2 > (1<<11) * 26)
6185 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006186 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006187
6188 if (c1 != c2)
6189 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006190
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006191 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 }
6193
6194 return (len1 < len2) ? -1 : (len1 != len2);
6195}
6196
Marc-André Lemburge5034372000-08-08 08:04:29 +00006197#else
6198
6199static int
6200unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6201{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006202 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006203
6204 Py_UNICODE *s1 = str1->str;
6205 Py_UNICODE *s2 = str2->str;
6206
6207 len1 = str1->length;
6208 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006209
Marc-André Lemburge5034372000-08-08 08:04:29 +00006210 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006211 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006212
Fredrik Lundh45714e92001-06-26 16:39:36 +00006213 c1 = *s1++;
6214 c2 = *s2++;
6215
6216 if (c1 != c2)
6217 return (c1 < c2) ? -1 : 1;
6218
Marc-André Lemburge5034372000-08-08 08:04:29 +00006219 len1--; len2--;
6220 }
6221
6222 return (len1 < len2) ? -1 : (len1 != len2);
6223}
6224
6225#endif
6226
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227int PyUnicode_Compare(PyObject *left,
6228 PyObject *right)
6229{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006230 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6231 return unicode_compare((PyUnicodeObject *)left,
6232 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006233 PyErr_Format(PyExc_TypeError,
6234 "Can't compare %.100s and %.100s",
6235 left->ob_type->tp_name,
6236 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 return -1;
6238}
6239
Martin v. Löwis5b222132007-06-10 09:51:05 +00006240int
6241PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6242{
6243 int i;
6244 Py_UNICODE *id;
6245 assert(PyUnicode_Check(uni));
6246 id = PyUnicode_AS_UNICODE(uni);
6247 /* Compare Unicode string and source character set string */
6248 for (i = 0; id[i] && str[i]; i++)
6249 if (id[i] != str[i])
6250 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6251 if (id[i])
6252 return 1; /* uni is longer */
6253 if (str[i])
6254 return -1; /* str is longer */
6255 return 0;
6256}
6257
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006258PyObject *PyUnicode_RichCompare(PyObject *left,
6259 PyObject *right,
6260 int op)
6261{
6262 int result;
6263
6264 result = PyUnicode_Compare(left, right);
6265 if (result == -1 && PyErr_Occurred())
6266 goto onError;
6267
6268 /* Convert the return value to a Boolean */
6269 switch (op) {
6270 case Py_EQ:
6271 result = (result == 0);
6272 break;
6273 case Py_NE:
6274 result = (result != 0);
6275 break;
6276 case Py_LE:
6277 result = (result <= 0);
6278 break;
6279 case Py_GE:
6280 result = (result >= 0);
6281 break;
6282 case Py_LT:
6283 result = (result == -1);
6284 break;
6285 case Py_GT:
6286 result = (result == 1);
6287 break;
6288 }
6289 return PyBool_FromLong(result);
6290
6291 onError:
6292
6293 /* Standard case
6294
6295 Type errors mean that PyUnicode_FromObject() could not convert
6296 one of the arguments (usually the right hand side) to Unicode,
6297 ie. we can't handle the comparison request. However, it is
6298 possible that the other object knows a comparison method, which
6299 is why we return Py_NotImplemented to give the other object a
6300 chance.
6301
6302 */
6303 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6304 PyErr_Clear();
6305 Py_INCREF(Py_NotImplemented);
6306 return Py_NotImplemented;
6307 }
6308 if (op != Py_EQ && op != Py_NE)
6309 return NULL;
6310
6311 /* Equality comparison.
6312
6313 This is a special case: we silence any PyExc_UnicodeDecodeError
6314 and instead turn it into a PyErr_UnicodeWarning.
6315
6316 */
6317 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6318 return NULL;
6319 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006320 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6321 (op == Py_EQ) ?
6322 "Unicode equal comparison "
6323 "failed to convert both arguments to Unicode - "
6324 "interpreting them as being unequal"
6325 :
6326 "Unicode unequal comparison "
6327 "failed to convert both arguments to Unicode - "
6328 "interpreting them as being unequal",
6329 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006330 return NULL;
6331 result = (op == Py_NE);
6332 return PyBool_FromLong(result);
6333}
6334
Guido van Rossum403d68b2000-03-13 15:55:09 +00006335int PyUnicode_Contains(PyObject *container,
6336 PyObject *element)
6337{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006338 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006339 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006340
6341 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006342 sub = PyUnicode_FromObject(element);
6343 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006344 PyErr_Format(PyExc_TypeError,
6345 "'in <string>' requires string as left operand, not %s",
6346 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006347 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006348 }
6349
Thomas Wouters477c8d52006-05-27 19:21:47 +00006350 str = PyUnicode_FromObject(container);
6351 if (!str) {
6352 Py_DECREF(sub);
6353 return -1;
6354 }
6355
6356 result = stringlib_contains_obj(str, sub);
6357
6358 Py_DECREF(str);
6359 Py_DECREF(sub);
6360
Guido van Rossum403d68b2000-03-13 15:55:09 +00006361 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006362}
6363
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364/* Concat to string or Unicode object giving a new Unicode object. */
6365
6366PyObject *PyUnicode_Concat(PyObject *left,
6367 PyObject *right)
6368{
6369 PyUnicodeObject *u = NULL, *v = NULL, *w;
6370
6371 /* Coerce the two arguments */
6372 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6373 if (u == NULL)
6374 goto onError;
6375 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6376 if (v == NULL)
6377 goto onError;
6378
6379 /* Shortcuts */
6380 if (v == unicode_empty) {
6381 Py_DECREF(v);
6382 return (PyObject *)u;
6383 }
6384 if (u == unicode_empty) {
6385 Py_DECREF(u);
6386 return (PyObject *)v;
6387 }
6388
6389 /* Concat the two Unicode strings */
6390 w = _PyUnicode_New(u->length + v->length);
6391 if (w == NULL)
6392 goto onError;
6393 Py_UNICODE_COPY(w->str, u->str, u->length);
6394 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6395
6396 Py_DECREF(u);
6397 Py_DECREF(v);
6398 return (PyObject *)w;
6399
6400onError:
6401 Py_XDECREF(u);
6402 Py_XDECREF(v);
6403 return NULL;
6404}
6405
Walter Dörwald1ab83302007-05-18 17:15:44 +00006406void
6407PyUnicode_Append(PyObject **pleft, PyObject *right)
6408{
6409 PyObject *new;
6410 if (*pleft == NULL)
6411 return;
6412 if (right == NULL || !PyUnicode_Check(*pleft)) {
6413 Py_DECREF(*pleft);
6414 *pleft = NULL;
6415 return;
6416 }
6417 new = PyUnicode_Concat(*pleft, right);
6418 Py_DECREF(*pleft);
6419 *pleft = new;
6420}
6421
6422void
6423PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6424{
6425 PyUnicode_Append(pleft, right);
6426 Py_XDECREF(right);
6427}
6428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006429PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430"S.count(sub[, start[, end]]) -> int\n\
6431\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006432Return the number of non-overlapping occurrences of substring sub in\n\
6433Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006434interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
6436static PyObject *
6437unicode_count(PyUnicodeObject *self, PyObject *args)
6438{
6439 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006440 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006441 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 PyObject *result;
6443
Guido van Rossumb8872e62000-05-09 14:14:27 +00006444 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6445 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 return NULL;
6447
6448 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 if (substring == NULL)
6451 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006452
Thomas Wouters477c8d52006-05-27 19:21:47 +00006453 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454
Thomas Wouters477c8d52006-05-27 19:21:47 +00006455 result = PyInt_FromSsize_t(
6456 stringlib_count(self->str + start, end - start,
6457 substring->str, substring->length)
6458 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
6460 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006461
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 return result;
6463}
6464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006465PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006466"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006468Encodes S using the codec registered for encoding. encoding defaults\n\
6469to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006470handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6472'xmlcharrefreplace' as well as any other name registered with\n\
6473codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
6475static PyObject *
6476unicode_encode(PyUnicodeObject *self, PyObject *args)
6477{
6478 char *encoding = NULL;
6479 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006480 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006481
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6483 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006484 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006485 if (v == NULL)
6486 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00006487 if (!PyString_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006488 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006489 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006490 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006491 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006492 Py_DECREF(v);
6493 return NULL;
6494 }
6495 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006496
6497 onError:
6498 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006499}
6500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006501PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502"S.expandtabs([tabsize]) -> unicode\n\
6503\n\
6504Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006505If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
6507static PyObject*
6508unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6509{
6510 Py_UNICODE *e;
6511 Py_UNICODE *p;
6512 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006513 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 PyUnicodeObject *u;
6515 int tabsize = 8;
6516
6517 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6518 return NULL;
6519
Thomas Wouters7e474022000-07-16 12:04:32 +00006520 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006521 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 e = self->str + self->length;
6523 for (p = self->str; p < e; p++)
6524 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006525 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006527 if (old_j > j) {
6528 PyErr_SetString(PyExc_OverflowError,
6529 "new string is too long");
6530 return NULL;
6531 }
6532 old_j = j;
6533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 }
6535 else {
6536 j++;
6537 if (*p == '\n' || *p == '\r') {
6538 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006539 old_j = j = 0;
6540 if (i < 0) {
6541 PyErr_SetString(PyExc_OverflowError,
6542 "new string is too long");
6543 return NULL;
6544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 }
6546 }
6547
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006548 if ((i + j) < 0) {
6549 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6550 return NULL;
6551 }
6552
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 /* Second pass: create output string and fill it */
6554 u = _PyUnicode_New(i + j);
6555 if (!u)
6556 return NULL;
6557
6558 j = 0;
6559 q = u->str;
6560
6561 for (p = self->str; p < e; p++)
6562 if (*p == '\t') {
6563 if (tabsize > 0) {
6564 i = tabsize - (j % tabsize);
6565 j += i;
6566 while (i--)
6567 *q++ = ' ';
6568 }
6569 }
6570 else {
6571 j++;
6572 *q++ = *p;
6573 if (*p == '\n' || *p == '\r')
6574 j = 0;
6575 }
6576
6577 return (PyObject*) u;
6578}
6579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581"S.find(sub [,start [,end]]) -> int\n\
6582\n\
6583Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006584such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585arguments start and end are interpreted as in slice notation.\n\
6586\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006587Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588
6589static PyObject *
6590unicode_find(PyUnicodeObject *self, PyObject *args)
6591{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006592 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006593 Py_ssize_t start;
6594 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006595 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
Christian Heimes9cd17752007-11-18 19:35:23 +00006597 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
Thomas Wouters477c8d52006-05-27 19:21:47 +00006600 result = stringlib_find_slice(
6601 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6602 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6603 start, end
6604 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605
6606 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006607
6608 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609}
6610
6611static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006612unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
6614 if (index < 0 || index >= self->length) {
6615 PyErr_SetString(PyExc_IndexError, "string index out of range");
6616 return NULL;
6617 }
6618
6619 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6620}
6621
Guido van Rossumc2504932007-09-18 19:42:40 +00006622/* Believe it or not, this produces the same value for ASCII strings
6623 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00006625unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626{
Guido van Rossumc2504932007-09-18 19:42:40 +00006627 Py_ssize_t len;
6628 Py_UNICODE *p;
6629 long x;
6630
6631 if (self->hash != -1)
6632 return self->hash;
6633 len = Py_Size(self);
6634 p = self->str;
6635 x = *p << 7;
6636 while (--len >= 0)
6637 x = (1000003*x) ^ *p++;
6638 x ^= Py_Size(self);
6639 if (x == -1)
6640 x = -2;
6641 self->hash = x;
6642 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643}
6644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006645PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646"S.index(sub [,start [,end]]) -> int\n\
6647\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006648Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649
6650static PyObject *
6651unicode_index(PyUnicodeObject *self, PyObject *args)
6652{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006653 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006654 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00006655 Py_ssize_t start;
6656 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
Christian Heimes9cd17752007-11-18 19:35:23 +00006658 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660
Thomas Wouters477c8d52006-05-27 19:21:47 +00006661 result = stringlib_find_slice(
6662 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6663 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6664 start, end
6665 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
6667 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006668
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 if (result < 0) {
6670 PyErr_SetString(PyExc_ValueError, "substring not found");
6671 return NULL;
6672 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006673
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675}
6676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006677PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006678"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006680Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006681at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
6683static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006684unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685{
6686 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6687 register const Py_UNICODE *e;
6688 int cased;
6689
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 /* Shortcut for single character strings */
6691 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006692 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006694 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006695 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006696 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 e = p + PyUnicode_GET_SIZE(self);
6699 cased = 0;
6700 for (; p < e; p++) {
6701 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006702
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006704 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 else if (!cased && Py_UNICODE_ISLOWER(ch))
6706 cased = 1;
6707 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006708 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709}
6710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006711PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006714Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006715at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716
6717static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006718unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
6720 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6721 register const Py_UNICODE *e;
6722 int cased;
6723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 /* Shortcut for single character strings */
6725 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006726 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006728 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006729 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006730 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006731
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 e = p + PyUnicode_GET_SIZE(self);
6733 cased = 0;
6734 for (; p < e; p++) {
6735 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006738 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 else if (!cased && Py_UNICODE_ISUPPER(ch))
6740 cased = 1;
6741 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006742 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743}
6744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006745PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006746"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006748Return True if S is a titlecased string and there is at least one\n\
6749character in S, i.e. upper- and titlecase characters may only\n\
6750follow uncased characters and lowercase characters only cased ones.\n\
6751Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752
6753static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006754unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755{
6756 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6757 register const Py_UNICODE *e;
6758 int cased, previous_is_cased;
6759
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 /* Shortcut for single character strings */
6761 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006762 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6763 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006765 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006766 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 e = p + PyUnicode_GET_SIZE(self);
6770 cased = 0;
6771 previous_is_cased = 0;
6772 for (; p < e; p++) {
6773 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006774
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6776 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 previous_is_cased = 1;
6779 cased = 1;
6780 }
6781 else if (Py_UNICODE_ISLOWER(ch)) {
6782 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 previous_is_cased = 1;
6785 cased = 1;
6786 }
6787 else
6788 previous_is_cased = 0;
6789 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006790 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791}
6792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006793PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006794"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006796Return True if all characters in S are whitespace\n\
6797and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798
6799static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006800unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801{
6802 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6803 register const Py_UNICODE *e;
6804
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 /* Shortcut for single character strings */
6806 if (PyUnicode_GET_SIZE(self) == 1 &&
6807 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006808 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006810 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006811 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 e = p + PyUnicode_GET_SIZE(self);
6815 for (; p < e; p++) {
6816 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006817 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006819 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820}
6821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006823"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006824\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006825Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006826and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006827
6828static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006829unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006830{
6831 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6832 register const Py_UNICODE *e;
6833
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006834 /* Shortcut for single character strings */
6835 if (PyUnicode_GET_SIZE(self) == 1 &&
6836 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006837 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006838
6839 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006840 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006841 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006842
6843 e = p + PyUnicode_GET_SIZE(self);
6844 for (; p < e; p++) {
6845 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006846 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006847 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006849}
6850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006851PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006852"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006853\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006854Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006855and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856
6857static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006858unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859{
6860 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6861 register const Py_UNICODE *e;
6862
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006863 /* Shortcut for single character strings */
6864 if (PyUnicode_GET_SIZE(self) == 1 &&
6865 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006866 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006867
6868 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006869 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871
6872 e = p + PyUnicode_GET_SIZE(self);
6873 for (; p < e; p++) {
6874 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006875 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006876 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006878}
6879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006880PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006881"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006884False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
6886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006887unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
6889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6890 register const Py_UNICODE *e;
6891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 /* Shortcut for single character strings */
6893 if (PyUnicode_GET_SIZE(self) == 1 &&
6894 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006895 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006897 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006898 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 e = p + PyUnicode_GET_SIZE(self);
6902 for (; p < e; p++) {
6903 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006909PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006912Return True if all characters in S are digits\n\
6913and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914
6915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006916unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917{
6918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6919 register const Py_UNICODE *e;
6920
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 /* Shortcut for single character strings */
6922 if (PyUnicode_GET_SIZE(self) == 1 &&
6923 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006924 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006926 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006927 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006929
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 e = p + PyUnicode_GET_SIZE(self);
6931 for (; p < e; p++) {
6932 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006942False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1 &&
6952 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006956 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006958
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 e = p + PyUnicode_GET_SIZE(self);
6960 for (; p < e; p++) {
6961 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006964 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965}
6966
Martin v. Löwis47383402007-08-15 07:32:56 +00006967int
6968PyUnicode_IsIdentifier(PyObject *self)
6969{
6970 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6971 register const Py_UNICODE *e;
6972
6973 /* Special case for empty strings */
6974 if (PyUnicode_GET_SIZE(self) == 0)
6975 return 0;
6976
6977 /* PEP 3131 says that the first character must be in
6978 XID_Start and subsequent characters in XID_Continue,
6979 and for the ASCII range, the 2.x rules apply (i.e
6980 start with letters and underscore, continue with
6981 letters, digits, underscore). However, given the current
6982 definition of XID_Start and XID_Continue, it is sufficient
6983 to check just for these, except that _ must be allowed
6984 as starting an identifier. */
6985 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6986 return 0;
6987
6988 e = p + PyUnicode_GET_SIZE(self);
6989 for (p++; p < e; p++) {
6990 if (!_PyUnicode_IsXidContinue(*p))
6991 return 0;
6992 }
6993 return 1;
6994}
6995
6996PyDoc_STRVAR(isidentifier__doc__,
6997"S.isidentifier() -> bool\n\
6998\n\
6999Return True if S is a valid identifier according\n\
7000to the language definition.");
7001
7002static PyObject*
7003unicode_isidentifier(PyObject *self)
7004{
7005 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7006}
7007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007008PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009"S.join(sequence) -> unicode\n\
7010\n\
7011Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007012sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013
7014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007015unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007017 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018}
7019
Martin v. Löwis18e16552006-02-15 17:27:45 +00007020static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021unicode_length(PyUnicodeObject *self)
7022{
7023 return self->length;
7024}
7025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007026PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007027"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028\n\
7029Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007030done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
7032static PyObject *
7033unicode_ljust(PyUnicodeObject *self, PyObject *args)
7034{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007035 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007036 Py_UNICODE fillchar = ' ';
7037
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007038 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 return NULL;
7040
Tim Peters7a29bd52001-09-12 03:03:31 +00007041 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 Py_INCREF(self);
7043 return (PyObject*) self;
7044 }
7045
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007046 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047}
7048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007049PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050"S.lower() -> unicode\n\
7051\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007052Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053
7054static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007055unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 return fixup(self, fixlower);
7058}
7059
/* Selectors naming which end(s) of the string a strip call trims */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: its format string without the
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7068
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069/* externally visible for str.strip(unicode) */
7070PyObject *
7071_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7072{
7073 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007074 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007075 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007076 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7077 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007078
Thomas Wouters477c8d52006-05-27 19:21:47 +00007079 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7080
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007081 i = 0;
7082 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007083 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7084 i++;
7085 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007086 }
7087
7088 j = len;
7089 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007090 do {
7091 j--;
7092 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7093 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094 }
7095
7096 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007097 Py_INCREF(self);
7098 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007099 }
7100 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007101 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007102}
7103
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104
7105static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007109 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007110
7111 i = 0;
7112 if (striptype != RIGHTSTRIP) {
7113 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7114 i++;
7115 }
7116 }
7117
7118 j = len;
7119 if (striptype != LEFTSTRIP) {
7120 do {
7121 j--;
7122 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7123 j++;
7124 }
7125
7126 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7127 Py_INCREF(self);
7128 return (PyObject*)self;
7129 }
7130 else
7131 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132}
7133
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007134
7135static PyObject *
7136do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7137{
7138 PyObject *sep = NULL;
7139
7140 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7141 return NULL;
7142
7143 if (sep != NULL && sep != Py_None) {
7144 if (PyUnicode_Check(sep))
7145 return _PyUnicode_XStrip(self, striptype, sep);
7146 else if (PyString_Check(sep)) {
7147 PyObject *res;
7148 sep = PyUnicode_FromObject(sep);
7149 if (sep==NULL)
7150 return NULL;
7151 res = _PyUnicode_XStrip(self, striptype, sep);
7152 Py_DECREF(sep);
7153 return res;
7154 }
7155 else {
7156 PyErr_Format(PyExc_TypeError,
7157 "%s arg must be None, unicode or str",
7158 STRIPNAME(striptype));
7159 return NULL;
7160 }
7161 }
7162
7163 return do_strip(self, striptype);
7164}
7165
7166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007167PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007168"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007169\n\
7170Return a copy of the string S with leading and trailing\n\
7171whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007172If chars is given and not None, remove characters in chars instead.\n\
7173If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007174
7175static PyObject *
7176unicode_strip(PyUnicodeObject *self, PyObject *args)
7177{
7178 if (PyTuple_GET_SIZE(args) == 0)
7179 return do_strip(self, BOTHSTRIP); /* Common case */
7180 else
7181 return do_argstrip(self, BOTHSTRIP, args);
7182}
7183
7184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007185PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007186"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007187\n\
7188Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007189If chars is given and not None, remove characters in chars instead.\n\
7190If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007191
7192static PyObject *
7193unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7194{
7195 if (PyTuple_GET_SIZE(args) == 0)
7196 return do_strip(self, LEFTSTRIP); /* Common case */
7197 else
7198 return do_argstrip(self, LEFTSTRIP, args);
7199}
7200
7201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007202PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007203"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007204\n\
7205Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007206If chars is given and not None, remove characters in chars instead.\n\
7207If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007208
7209static PyObject *
7210unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7211{
7212 if (PyTuple_GET_SIZE(args) == 0)
7213 return do_strip(self, RIGHTSTRIP); /* Common case */
7214 else
7215 return do_argstrip(self, RIGHTSTRIP, args);
7216}
7217
7218
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007220unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221{
7222 PyUnicodeObject *u;
7223 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007224 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007225 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
7227 if (len < 0)
7228 len = 0;
7229
Tim Peters7a29bd52001-09-12 03:03:31 +00007230 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 /* no repeat, return original string */
7232 Py_INCREF(str);
7233 return (PyObject*) str;
7234 }
Tim Peters8f422462000-09-09 06:13:41 +00007235
7236 /* ensure # of chars needed doesn't overflow int and # of bytes
7237 * needed doesn't overflow size_t
7238 */
7239 nchars = len * str->length;
7240 if (len && nchars / len != str->length) {
7241 PyErr_SetString(PyExc_OverflowError,
7242 "repeated string is too long");
7243 return NULL;
7244 }
7245 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7246 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7247 PyErr_SetString(PyExc_OverflowError,
7248 "repeated string is too long");
7249 return NULL;
7250 }
7251 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 if (!u)
7253 return NULL;
7254
7255 p = u->str;
7256
Thomas Wouters477c8d52006-05-27 19:21:47 +00007257 if (str->length == 1 && len > 0) {
7258 Py_UNICODE_FILL(p, str->str[0], len);
7259 } else {
7260 Py_ssize_t done = 0; /* number of characters copied this far */
7261 if (done < nchars) {
7262 Py_UNICODE_COPY(p, str->str, str->length);
7263 done = str->length;
7264 }
7265 while (done < nchars) {
7266 int n = (done <= nchars-done) ? done : nchars-done;
7267 Py_UNICODE_COPY(p+done, p, n);
7268 done += n;
7269 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 }
7271
7272 return (PyObject*) u;
7273}
7274
7275PyObject *PyUnicode_Replace(PyObject *obj,
7276 PyObject *subobj,
7277 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007278 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279{
7280 PyObject *self;
7281 PyObject *str1;
7282 PyObject *str2;
7283 PyObject *result;
7284
7285 self = PyUnicode_FromObject(obj);
7286 if (self == NULL)
7287 return NULL;
7288 str1 = PyUnicode_FromObject(subobj);
7289 if (str1 == NULL) {
7290 Py_DECREF(self);
7291 return NULL;
7292 }
7293 str2 = PyUnicode_FromObject(replobj);
7294 if (str2 == NULL) {
7295 Py_DECREF(self);
7296 Py_DECREF(str1);
7297 return NULL;
7298 }
Tim Petersced69f82003-09-16 20:30:58 +00007299 result = replace((PyUnicodeObject *)self,
7300 (PyUnicodeObject *)str1,
7301 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 maxcount);
7303 Py_DECREF(self);
7304 Py_DECREF(str1);
7305 Py_DECREF(str2);
7306 return result;
7307}
7308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007309PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310"S.replace (old, new[, maxsplit]) -> unicode\n\
7311\n\
7312Return a copy of S with all occurrences of substring\n\
7313old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007314given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
7316static PyObject*
7317unicode_replace(PyUnicodeObject *self, PyObject *args)
7318{
7319 PyUnicodeObject *str1;
7320 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007321 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 PyObject *result;
7323
Martin v. Löwis18e16552006-02-15 17:27:45 +00007324 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325 return NULL;
7326 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7327 if (str1 == NULL)
7328 return NULL;
7329 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007330 if (str2 == NULL) {
7331 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334
7335 result = replace(self, str1, str2, maxcount);
7336
7337 Py_DECREF(str1);
7338 Py_DECREF(str2);
7339 return result;
7340}
7341
7342static
7343PyObject *unicode_repr(PyObject *unicode)
7344{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007345 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007346 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007347 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7348 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7349
7350 /* XXX(nnorwitz): rather than over-allocating, it would be
7351 better to choose a different scheme. Perhaps scan the
7352 first N-chars of the string and allocate based on that size.
7353 */
7354 /* Initial allocation is based on the longest-possible unichr
7355 escape.
7356
7357 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7358 unichr, so in this case it's the longest unichr escape. In
7359 narrow (UTF-16) builds this is five chars per source unichr
7360 since there are two unichrs in the surrogate pair, so in narrow
7361 (UTF-16) builds it's not the longest unichr escape.
7362
7363 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7364 so in the narrow (UTF-16) build case it's the longest unichr
7365 escape.
7366 */
7367
Walter Dörwald1ab83302007-05-18 17:15:44 +00007368 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007369 2 /* quotes */
7370#ifdef Py_UNICODE_WIDE
7371 + 10*size
7372#else
7373 + 6*size
7374#endif
7375 + 1);
7376 if (repr == NULL)
7377 return NULL;
7378
Walter Dörwald1ab83302007-05-18 17:15:44 +00007379 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007380
7381 /* Add quote */
7382 *p++ = (findchar(s, size, '\'') &&
7383 !findchar(s, size, '"')) ? '"' : '\'';
7384 while (size-- > 0) {
7385 Py_UNICODE ch = *s++;
7386
7387 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007388 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007389 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007390 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007391 continue;
7392 }
7393
7394#ifdef Py_UNICODE_WIDE
7395 /* Map 21-bit characters to '\U00xxxxxx' */
7396 else if (ch >= 0x10000) {
7397 *p++ = '\\';
7398 *p++ = 'U';
7399 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7400 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7401 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7402 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7403 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7404 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7405 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7406 *p++ = hexdigits[ch & 0x0000000F];
7407 continue;
7408 }
7409#else
7410 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7411 else if (ch >= 0xD800 && ch < 0xDC00) {
7412 Py_UNICODE ch2;
7413 Py_UCS4 ucs;
7414
7415 ch2 = *s++;
7416 size--;
7417 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7418 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7419 *p++ = '\\';
7420 *p++ = 'U';
7421 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7422 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7423 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7424 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7425 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7426 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7427 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7428 *p++ = hexdigits[ucs & 0x0000000F];
7429 continue;
7430 }
7431 /* Fall through: isolated surrogates are copied as-is */
7432 s--;
7433 size++;
7434 }
7435#endif
7436
7437 /* Map 16-bit characters to '\uxxxx' */
7438 if (ch >= 256) {
7439 *p++ = '\\';
7440 *p++ = 'u';
7441 *p++ = hexdigits[(ch >> 12) & 0x000F];
7442 *p++ = hexdigits[(ch >> 8) & 0x000F];
7443 *p++ = hexdigits[(ch >> 4) & 0x000F];
7444 *p++ = hexdigits[ch & 0x000F];
7445 }
7446
7447 /* Map special whitespace to '\t', \n', '\r' */
7448 else if (ch == '\t') {
7449 *p++ = '\\';
7450 *p++ = 't';
7451 }
7452 else if (ch == '\n') {
7453 *p++ = '\\';
7454 *p++ = 'n';
7455 }
7456 else if (ch == '\r') {
7457 *p++ = '\\';
7458 *p++ = 'r';
7459 }
7460
7461 /* Map non-printable US ASCII to '\xhh' */
7462 else if (ch < ' ' || ch >= 0x7F) {
7463 *p++ = '\\';
7464 *p++ = 'x';
7465 *p++ = hexdigits[(ch >> 4) & 0x000F];
7466 *p++ = hexdigits[ch & 0x000F];
7467 }
7468
7469 /* Copy everything else as-is */
7470 else
7471 *p++ = (char) ch;
7472 }
7473 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007474 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007475
7476 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007477 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007478 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479}
7480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007481PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482"S.rfind(sub [,start [,end]]) -> int\n\
7483\n\
7484Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007485such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486arguments start and end are interpreted as in slice notation.\n\
7487\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007488Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489
7490static PyObject *
7491unicode_rfind(PyUnicodeObject *self, PyObject *args)
7492{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007493 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007494 Py_ssize_t start;
7495 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007496 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
Christian Heimes9cd17752007-11-18 19:35:23 +00007498 if (!_ParseTupleFinds(args, &substring, &start, &end))
7499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501 result = stringlib_rfind_slice(
7502 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7503 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7504 start, end
7505 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506
7507 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007508
7509 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510}
7511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007512PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513"S.rindex(sub [,start [,end]]) -> int\n\
7514\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007515Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516
7517static PyObject *
7518unicode_rindex(PyUnicodeObject *self, PyObject *args)
7519{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007520 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007521 Py_ssize_t start;
7522 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007523 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
Christian Heimes9cd17752007-11-18 19:35:23 +00007525 if (!_ParseTupleFinds(args, &substring, &start, &end))
7526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527
Thomas Wouters477c8d52006-05-27 19:21:47 +00007528 result = stringlib_rfind_slice(
7529 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7530 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7531 start, end
7532 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
7534 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007535
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 if (result < 0) {
7537 PyErr_SetString(PyExc_ValueError, "substring not found");
7538 return NULL;
7539 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541}
7542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007543PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007544"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545\n\
7546Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007547done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548
7549static PyObject *
7550unicode_rjust(PyUnicodeObject *self, PyObject *args)
7551{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007552 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007553 Py_UNICODE fillchar = ' ';
7554
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007555 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 return NULL;
7557
Tim Peters7a29bd52001-09-12 03:03:31 +00007558 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 Py_INCREF(self);
7560 return (PyObject*) self;
7561 }
7562
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007563 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564}
7565
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566PyObject *PyUnicode_Split(PyObject *s,
7567 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007568 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569{
7570 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007571
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 s = PyUnicode_FromObject(s);
7573 if (s == NULL)
7574 return NULL;
7575 if (sep != NULL) {
7576 sep = PyUnicode_FromObject(sep);
7577 if (sep == NULL) {
7578 Py_DECREF(s);
7579 return NULL;
7580 }
7581 }
7582
7583 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7584
7585 Py_DECREF(s);
7586 Py_XDECREF(sep);
7587 return result;
7588}
7589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007590PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591"S.split([sep [,maxsplit]]) -> list of strings\n\
7592\n\
7593Return a list of the words in S, using sep as the\n\
7594delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007595splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007596any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597
7598static PyObject*
7599unicode_split(PyUnicodeObject *self, PyObject *args)
7600{
7601 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007602 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
Martin v. Löwis18e16552006-02-15 17:27:45 +00007604 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 return NULL;
7606
7607 if (substring == Py_None)
7608 return split(self, NULL, maxcount);
7609 else if (PyUnicode_Check(substring))
7610 return split(self, (PyUnicodeObject *)substring, maxcount);
7611 else
7612 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7613}
7614
Thomas Wouters477c8d52006-05-27 19:21:47 +00007615PyObject *
7616PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7617{
7618 PyObject* str_obj;
7619 PyObject* sep_obj;
7620 PyObject* out;
7621
7622 str_obj = PyUnicode_FromObject(str_in);
7623 if (!str_obj)
7624 return NULL;
7625 sep_obj = PyUnicode_FromObject(sep_in);
7626 if (!sep_obj) {
7627 Py_DECREF(str_obj);
7628 return NULL;
7629 }
7630
7631 out = stringlib_partition(
7632 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7633 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7634 );
7635
7636 Py_DECREF(sep_obj);
7637 Py_DECREF(str_obj);
7638
7639 return out;
7640}
7641
7642
7643PyObject *
7644PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7645{
7646 PyObject* str_obj;
7647 PyObject* sep_obj;
7648 PyObject* out;
7649
7650 str_obj = PyUnicode_FromObject(str_in);
7651 if (!str_obj)
7652 return NULL;
7653 sep_obj = PyUnicode_FromObject(sep_in);
7654 if (!sep_obj) {
7655 Py_DECREF(str_obj);
7656 return NULL;
7657 }
7658
7659 out = stringlib_rpartition(
7660 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7661 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7662 );
7663
7664 Py_DECREF(sep_obj);
7665 Py_DECREF(str_obj);
7666
7667 return out;
7668}
7669
7670PyDoc_STRVAR(partition__doc__,
7671"S.partition(sep) -> (head, sep, tail)\n\
7672\n\
7673Searches for the separator sep in S, and returns the part before it,\n\
7674the separator itself, and the part after it. If the separator is not\n\
7675found, returns S and two empty strings.");
7676
7677static PyObject*
7678unicode_partition(PyUnicodeObject *self, PyObject *separator)
7679{
7680 return PyUnicode_Partition((PyObject *)self, separator);
7681}
7682
7683PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007684"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007685\n\
7686Searches for the separator sep in S, starting at the end of S, and returns\n\
7687the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007688separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007689
7690static PyObject*
7691unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7692{
7693 return PyUnicode_RPartition((PyObject *)self, separator);
7694}
7695
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007696PyObject *PyUnicode_RSplit(PyObject *s,
7697 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007698 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007699{
7700 PyObject *result;
7701
7702 s = PyUnicode_FromObject(s);
7703 if (s == NULL)
7704 return NULL;
7705 if (sep != NULL) {
7706 sep = PyUnicode_FromObject(sep);
7707 if (sep == NULL) {
7708 Py_DECREF(s);
7709 return NULL;
7710 }
7711 }
7712
7713 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7714
7715 Py_DECREF(s);
7716 Py_XDECREF(sep);
7717 return result;
7718}
7719
7720PyDoc_STRVAR(rsplit__doc__,
7721"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7722\n\
7723Return a list of the words in S, using sep as the\n\
7724delimiter string, starting at the end of the string and\n\
7725working to the front. If maxsplit is given, at most maxsplit\n\
7726splits are done. If sep is not specified, any whitespace string\n\
7727is a separator.");
7728
7729static PyObject*
7730unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7731{
7732 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007733 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007734
Martin v. Löwis18e16552006-02-15 17:27:45 +00007735 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007736 return NULL;
7737
7738 if (substring == Py_None)
7739 return rsplit(self, NULL, maxcount);
7740 else if (PyUnicode_Check(substring))
7741 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7742 else
7743 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7744}
7745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007746PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007747"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748\n\
7749Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007750Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007751is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
7753static PyObject*
7754unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7755{
Guido van Rossum86662912000-04-11 15:38:46 +00007756 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
Guido van Rossum86662912000-04-11 15:38:46 +00007758 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 return NULL;
7760
Guido van Rossum86662912000-04-11 15:38:46 +00007761 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762}
7763
7764static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007765PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766{
Walter Dörwald346737f2007-05-31 10:44:43 +00007767 if (PyUnicode_CheckExact(self)) {
7768 Py_INCREF(self);
7769 return self;
7770 } else
7771 /* Subtype -- return genuine unicode string with the same value. */
7772 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7773 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774}
7775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007776PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777"S.swapcase() -> unicode\n\
7778\n\
7779Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781
7782static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007783unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 return fixup(self, fixswapcase);
7786}
7787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007788PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789"S.translate(table) -> unicode\n\
7790\n\
7791Return a copy of the string S, where all characters have been mapped\n\
7792through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007793Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7794Unmapped characters are left untouched. Characters mapped to None\n\
7795are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796
7797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007798unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799{
Georg Brandl94c2c752007-10-23 06:52:59 +00007800 PyObject *newtable = NULL;
7801 Py_ssize_t i = 0;
7802 PyObject *key, *value, *result;
7803
7804 if (!PyDict_Check(table)) {
7805 PyErr_SetString(PyExc_TypeError, "translate argument must be a dict");
7806 return NULL;
7807 }
7808 /* fixup the table -- allow size-1 string keys instead of only int keys */
7809 newtable = PyDict_Copy(table);
7810 if (!newtable) return NULL;
7811 while (PyDict_Next(table, &i, &key, &value)) {
7812 if (PyUnicode_Check(key)) {
7813 /* convert string keys to integer keys */
7814 PyObject *newkey;
7815 int res;
7816 if (PyUnicode_GET_SIZE(key) != 1) {
7817 PyErr_SetString(PyExc_ValueError, "string items in translate "
7818 "table must be 1 element long");
7819 goto err;
7820 }
7821 newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]);
7822 if (!newkey)
7823 goto err;
7824 res = PyDict_SetItem(newtable, newkey, value);
7825 Py_DECREF(newkey);
7826 if (res < 0)
7827 goto err;
7828 } else if (PyInt_Check(key)) {
7829 /* just keep integer keys */
7830 if (PyDict_SetItem(newtable, key, value) < 0)
7831 goto err;
7832 } else {
7833 PyErr_SetString(PyExc_TypeError, "items in translate table must be "
7834 "strings or integers");
7835 goto err;
7836 }
7837 }
7838
7839 result = PyUnicode_TranslateCharmap(self->str,
7840 self->length,
7841 newtable,
7842 "ignore");
7843 Py_DECREF(newtable);
7844 return result;
7845 err:
7846 Py_DECREF(newtable);
7847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848}
7849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007850PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851"S.upper() -> unicode\n\
7852\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007853Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854
7855static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007856unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 return fixup(self, fixupper);
7859}
7860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007861PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862"S.zfill(width) -> unicode\n\
7863\n\
7864Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007865of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866
7867static PyObject *
7868unicode_zfill(PyUnicodeObject *self, PyObject *args)
7869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007870 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 PyUnicodeObject *u;
7872
Martin v. Löwis18e16552006-02-15 17:27:45 +00007873 Py_ssize_t width;
7874 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 return NULL;
7876
7877 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007878 if (PyUnicode_CheckExact(self)) {
7879 Py_INCREF(self);
7880 return (PyObject*) self;
7881 }
7882 else
7883 return PyUnicode_FromUnicode(
7884 PyUnicode_AS_UNICODE(self),
7885 PyUnicode_GET_SIZE(self)
7886 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 }
7888
7889 fill = width - self->length;
7890
7891 u = pad(self, fill, 0, '0');
7892
Walter Dörwald068325e2002-04-15 13:36:47 +00007893 if (u == NULL)
7894 return NULL;
7895
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 if (u->str[fill] == '+' || u->str[fill] == '-') {
7897 /* move sign to beginning of string */
7898 u->str[0] = u->str[fill];
7899 u->str[fill] = '0';
7900 }
7901
7902 return (PyObject*) u;
7903}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
   Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007913PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007914"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007916Return True if S starts with the specified prefix, False otherwise.\n\
7917With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007918With optional end, stop comparing S at that position.\n\
7919prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920
7921static PyObject *
7922unicode_startswith(PyUnicodeObject *self,
7923 PyObject *args)
7924{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007925 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007927 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007928 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007929 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007931 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007932 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007934 if (PyTuple_Check(subobj)) {
7935 Py_ssize_t i;
7936 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7937 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7938 PyTuple_GET_ITEM(subobj, i));
7939 if (substring == NULL)
7940 return NULL;
7941 result = tailmatch(self, substring, start, end, -1);
7942 Py_DECREF(substring);
7943 if (result) {
7944 Py_RETURN_TRUE;
7945 }
7946 }
7947 /* nothing matched */
7948 Py_RETURN_FALSE;
7949 }
7950 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007952 return NULL;
7953 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007955 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956}
7957
7958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007959PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007960"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007962Return True if S ends with the specified suffix, False otherwise.\n\
7963With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007964With optional end, stop comparing S at that position.\n\
7965suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966
7967static PyObject *
7968unicode_endswith(PyUnicodeObject *self,
7969 PyObject *args)
7970{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007971 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007973 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007974 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007975 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007977 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7978 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007980 if (PyTuple_Check(subobj)) {
7981 Py_ssize_t i;
7982 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7983 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7984 PyTuple_GET_ITEM(subobj, i));
7985 if (substring == NULL)
7986 return NULL;
7987 result = tailmatch(self, substring, start, end, +1);
7988 Py_DECREF(substring);
7989 if (result) {
7990 Py_RETURN_TRUE;
7991 }
7992 }
7993 Py_RETURN_FALSE;
7994 }
7995 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007999 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008001 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002}
8003
Eric Smith8c663262007-08-25 02:26:07 +00008004#include "stringlib/string_format.h"
8005
8006PyDoc_STRVAR(format__doc__,
8007"S.format(*args, **kwargs) -> unicode\n\
8008\n\
8009");
8010
Eric Smith8c663262007-08-25 02:26:07 +00008011PyDoc_STRVAR(p_format__doc__,
8012"S.__format__(format_spec) -> unicode\n\
8013\n\
8014");
8015
8016static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008017unicode_getnewargs(PyUnicodeObject *v)
8018{
8019 return Py_BuildValue("(u#)", v->str, v->length);
8020}
8021
8022
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023static PyMethodDef unicode_methods[] = {
8024
8025 /* Order is according to common usage: often used methods should
8026 appear first, since lookup is done sequentially. */
8027
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008028 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8029 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8030 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008031 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008032 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8033 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8034 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8035 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8036 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8037 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8038 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008039 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008040 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8041 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8042 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008043 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008044 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8045 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8046 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008047 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008048 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008049 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008050 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008051 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8052 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8053 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8054 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8055 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8056 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8057 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8058 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8059 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8060 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8061 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8062 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8063 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8064 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008065 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008066 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008067 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8068 {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008069 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8070 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Walter Dörwald068325e2002-04-15 13:36:47 +00008071#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008072 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073#endif
8074
8075#if 0
8076 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008077 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078#endif
8079
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008080 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 {NULL, NULL}
8082};
8083
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008084static PyObject *
8085unicode_mod(PyObject *v, PyObject *w)
8086{
8087 if (!PyUnicode_Check(v)) {
8088 Py_INCREF(Py_NotImplemented);
8089 return Py_NotImplemented;
8090 }
8091 return PyUnicode_Format(v, w);
8092}
8093
8094static PyNumberMethods unicode_as_number = {
8095 0, /*nb_add*/
8096 0, /*nb_subtract*/
8097 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008098 unicode_mod, /*nb_remainder*/
8099};
8100
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008102 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008103 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008104 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8105 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008106 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 0, /* sq_ass_item */
8108 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008109 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110};
8111
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008112static PyObject*
8113unicode_subscript(PyUnicodeObject* self, PyObject* item)
8114{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008115 if (PyIndex_Check(item)) {
8116 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008117 if (i == -1 && PyErr_Occurred())
8118 return NULL;
8119 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008120 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008121 return unicode_getitem(self, i);
8122 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008123 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008124 Py_UNICODE* source_buf;
8125 Py_UNICODE* result_buf;
8126 PyObject* result;
8127
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008128 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008129 &start, &stop, &step, &slicelength) < 0) {
8130 return NULL;
8131 }
8132
8133 if (slicelength <= 0) {
8134 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008135 } else if (start == 0 && step == 1 && slicelength == self->length &&
8136 PyUnicode_CheckExact(self)) {
8137 Py_INCREF(self);
8138 return (PyObject *)self;
8139 } else if (step == 1) {
8140 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008141 } else {
8142 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008143 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8144 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008145
8146 if (result_buf == NULL)
8147 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008148
8149 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8150 result_buf[i] = source_buf[cur];
8151 }
Tim Petersced69f82003-09-16 20:30:58 +00008152
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008153 result = PyUnicode_FromUnicode(result_buf, slicelength);
8154 PyMem_FREE(result_buf);
8155 return result;
8156 }
8157 } else {
8158 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8159 return NULL;
8160 }
8161}
8162
8163static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008164 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008165 (binaryfunc)unicode_subscript, /* mp_subscript */
8166 (objobjargproc)0, /* mp_ass_subscript */
8167};
8168
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170/* Helpers for PyUnicode_Format() */
8171
8172static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008173getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008175 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176 if (argidx < arglen) {
8177 (*p_argidx)++;
8178 if (arglen < 0)
8179 return args;
8180 else
8181 return PyTuple_GetItem(args, argidx);
8182 }
8183 PyErr_SetString(PyExc_TypeError,
8184 "not enough arguments for format string");
8185 return NULL;
8186}
8187
Martin v. Löwis18e16552006-02-15 17:27:45 +00008188static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008189strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008191 register Py_ssize_t i;
8192 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193 for (i = len - 1; i >= 0; i--)
8194 buffer[i] = (Py_UNICODE) charbuffer[i];
8195
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 return len;
8197}
8198
Neal Norwitzfc76d632006-01-10 06:03:13 +00008199static int
8200doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8201{
Tim Peters15231542006-02-16 01:08:01 +00008202 Py_ssize_t result;
8203
Neal Norwitzfc76d632006-01-10 06:03:13 +00008204 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008205 result = strtounicode(buffer, (char *)buffer);
8206 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008207}
8208
8209static int
8210longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8211{
Tim Peters15231542006-02-16 01:08:01 +00008212 Py_ssize_t result;
8213
Neal Norwitzfc76d632006-01-10 06:03:13 +00008214 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008215 result = strtounicode(buffer, (char *)buffer);
8216 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008217}
8218
Guido van Rossum078151d2002-08-11 04:24:12 +00008219/* XXX To save some code duplication, formatfloat/long/int could have been
8220 shared with stringobject.c, converting from 8-bit to Unicode after the
8221 formatting is done. */
8222
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223static int
8224formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008225 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 int flags,
8227 int prec,
8228 int type,
8229 PyObject *v)
8230{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008231 /* fmt = '%#.' + `prec` + `type`
8232 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 char fmt[20];
8234 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008235
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 x = PyFloat_AsDouble(v);
8237 if (x == -1.0 && PyErr_Occurred())
8238 return -1;
8239 if (prec < 0)
8240 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8242 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008243 /* Worst case length calc to ensure no buffer overrun:
8244
8245 'g' formats:
8246 fmt = %#.<prec>g
8247 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8248 for any double rep.)
8249 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8250
8251 'f' formats:
8252 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8253 len = 1 + 50 + 1 + prec = 52 + prec
8254
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008255 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008256 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008257
8258 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008259 if (((type == 'g' || type == 'G') &&
8260 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008261 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008262 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008263 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008264 return -1;
8265 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008266 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8267 (flags&F_ALT) ? "#" : "",
8268 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008269 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270}
8271
Tim Peters38fd5b62000-09-21 05:43:11 +00008272static PyObject*
8273formatlong(PyObject *val, int flags, int prec, int type)
8274{
8275 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008276 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008277 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008278 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008279
8280 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8281 if (!str)
8282 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008283 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008284 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008285 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008286}
8287
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288static int
8289formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008290 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 int flags,
8292 int prec,
8293 int type,
8294 PyObject *v)
8295{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008296 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008297 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8298 * + 1 + 1
8299 * = 24
8300 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008301 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008302 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 long x;
8304
8305 x = PyInt_AsLong(v);
8306 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008307 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008308 if (x < 0 && type == 'u') {
8309 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008310 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008311 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8312 sign = "-";
8313 else
8314 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008316 prec = 1;
8317
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008318 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8319 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008320 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008321 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008322 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008323 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008324 return -1;
8325 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008326
8327 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008328 (type == 'x' || type == 'X' || type == 'o')) {
8329 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008330 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008331 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008332 * - when 0 is being converted, the C standard leaves off
8333 * the '0x' or '0X', which is inconsistent with other
8334 * %#x/%#X conversions and inconsistent with Python's
8335 * hex() function
8336 * - there are platforms that violate the standard and
8337 * convert 0 with the '0x' or '0X'
8338 * (Metrowerks, Compaq Tru64)
8339 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008340 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008341 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008342 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008343 * We can achieve the desired consistency by inserting our
8344 * own '0x' or '0X' prefix, and substituting %x/%X in place
8345 * of %#x/%#X.
8346 *
8347 * Note that this is the same approach as used in
8348 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008349 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008350 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8351 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008352 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008353 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008354 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8355 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008356 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008357 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008358 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008359 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008360 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008361 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362}
8363
8364static int
8365formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008366 size_t buflen,
8367 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008369 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008370 if (PyUnicode_Check(v)) {
8371 if (PyUnicode_GET_SIZE(v) != 1)
8372 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008376 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008377 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008378 goto onError;
8379 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381
8382 else {
8383 /* Integer input truncated to a character */
8384 long x;
8385 x = PyInt_AsLong(v);
8386 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008387 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008388#ifdef Py_UNICODE_WIDE
8389 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008390 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008391 "%c arg not in range(0x110000) "
8392 "(wide Python build)");
8393 return -1;
8394 }
8395#else
8396 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008397 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008398 "%c arg not in range(0x10000) "
8399 "(narrow Python build)");
8400 return -1;
8401 }
8402#endif
8403 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 }
8405 buf[1] = '\0';
8406 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008407
8408 onError:
8409 PyErr_SetString(PyExc_TypeError,
8410 "%c requires int or char");
8411 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412}
8413
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in characters) of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  The formatting routines are handed the buffer length so they
   can guard against overflow, but a better solution may be to malloc a
   buffer of appropriate size for each format.  For now, the current
   solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8423
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424PyObject *PyUnicode_Format(PyObject *format,
8425 PyObject *args)
8426{
8427 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008428 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 int args_owned = 0;
8430 PyUnicodeObject *result = NULL;
8431 PyObject *dict = NULL;
8432 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008433
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 if (format == NULL || args == NULL) {
8435 PyErr_BadInternalCall();
8436 return NULL;
8437 }
8438 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008439 if (uformat == NULL)
8440 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441 fmt = PyUnicode_AS_UNICODE(uformat);
8442 fmtcnt = PyUnicode_GET_SIZE(uformat);
8443
8444 reslen = rescnt = fmtcnt + 100;
8445 result = _PyUnicode_New(reslen);
8446 if (result == NULL)
8447 goto onError;
8448 res = PyUnicode_AS_UNICODE(result);
8449
8450 if (PyTuple_Check(args)) {
8451 arglen = PyTuple_Size(args);
8452 argidx = 0;
8453 }
8454 else {
8455 arglen = -1;
8456 argidx = -2;
8457 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008458 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008459 !PyString_Check(args) && !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460 dict = args;
8461
8462 while (--fmtcnt >= 0) {
8463 if (*fmt != '%') {
8464 if (--rescnt < 0) {
8465 rescnt = fmtcnt + 100;
8466 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008467 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008468 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8470 --rescnt;
8471 }
8472 *res++ = *fmt++;
8473 }
8474 else {
8475 /* Got a format specifier */
8476 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008477 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 Py_UNICODE c = '\0';
8480 Py_UNICODE fill;
8481 PyObject *v = NULL;
8482 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008483 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008485 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008486 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487
8488 fmt++;
8489 if (*fmt == '(') {
8490 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008491 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 PyObject *key;
8493 int pcount = 1;
8494
8495 if (dict == NULL) {
8496 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008497 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 goto onError;
8499 }
8500 ++fmt;
8501 --fmtcnt;
8502 keystart = fmt;
8503 /* Skip over balanced parentheses */
8504 while (pcount > 0 && --fmtcnt >= 0) {
8505 if (*fmt == ')')
8506 --pcount;
8507 else if (*fmt == '(')
8508 ++pcount;
8509 fmt++;
8510 }
8511 keylen = fmt - keystart - 1;
8512 if (fmtcnt < 0 || pcount > 0) {
8513 PyErr_SetString(PyExc_ValueError,
8514 "incomplete format key");
8515 goto onError;
8516 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008517#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008518 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 then looked up since Python uses strings to hold
8520 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008521 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 key = PyUnicode_EncodeUTF8(keystart,
8523 keylen,
8524 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008525#else
8526 key = PyUnicode_FromUnicode(keystart, keylen);
8527#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 if (key == NULL)
8529 goto onError;
8530 if (args_owned) {
8531 Py_DECREF(args);
8532 args_owned = 0;
8533 }
8534 args = PyObject_GetItem(dict, key);
8535 Py_DECREF(key);
8536 if (args == NULL) {
8537 goto onError;
8538 }
8539 args_owned = 1;
8540 arglen = -1;
8541 argidx = -2;
8542 }
8543 while (--fmtcnt >= 0) {
8544 switch (c = *fmt++) {
8545 case '-': flags |= F_LJUST; continue;
8546 case '+': flags |= F_SIGN; continue;
8547 case ' ': flags |= F_BLANK; continue;
8548 case '#': flags |= F_ALT; continue;
8549 case '0': flags |= F_ZERO; continue;
8550 }
8551 break;
8552 }
8553 if (c == '*') {
8554 v = getnextarg(args, arglen, &argidx);
8555 if (v == NULL)
8556 goto onError;
8557 if (!PyInt_Check(v)) {
8558 PyErr_SetString(PyExc_TypeError,
8559 "* wants int");
8560 goto onError;
8561 }
8562 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008563 if (width == -1 && PyErr_Occurred())
8564 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 if (width < 0) {
8566 flags |= F_LJUST;
8567 width = -width;
8568 }
8569 if (--fmtcnt >= 0)
8570 c = *fmt++;
8571 }
8572 else if (c >= '0' && c <= '9') {
8573 width = c - '0';
8574 while (--fmtcnt >= 0) {
8575 c = *fmt++;
8576 if (c < '0' || c > '9')
8577 break;
8578 if ((width*10) / 10 != width) {
8579 PyErr_SetString(PyExc_ValueError,
8580 "width too big");
8581 goto onError;
8582 }
8583 width = width*10 + (c - '0');
8584 }
8585 }
8586 if (c == '.') {
8587 prec = 0;
8588 if (--fmtcnt >= 0)
8589 c = *fmt++;
8590 if (c == '*') {
8591 v = getnextarg(args, arglen, &argidx);
8592 if (v == NULL)
8593 goto onError;
8594 if (!PyInt_Check(v)) {
8595 PyErr_SetString(PyExc_TypeError,
8596 "* wants int");
8597 goto onError;
8598 }
8599 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008600 if (prec == -1 && PyErr_Occurred())
8601 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 if (prec < 0)
8603 prec = 0;
8604 if (--fmtcnt >= 0)
8605 c = *fmt++;
8606 }
8607 else if (c >= '0' && c <= '9') {
8608 prec = c - '0';
8609 while (--fmtcnt >= 0) {
8610 c = Py_CHARMASK(*fmt++);
8611 if (c < '0' || c > '9')
8612 break;
8613 if ((prec*10) / 10 != prec) {
8614 PyErr_SetString(PyExc_ValueError,
8615 "prec too big");
8616 goto onError;
8617 }
8618 prec = prec*10 + (c - '0');
8619 }
8620 }
8621 } /* prec */
8622 if (fmtcnt >= 0) {
8623 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 if (--fmtcnt >= 0)
8625 c = *fmt++;
8626 }
8627 }
8628 if (fmtcnt < 0) {
8629 PyErr_SetString(PyExc_ValueError,
8630 "incomplete format");
8631 goto onError;
8632 }
8633 if (c != '%') {
8634 v = getnextarg(args, arglen, &argidx);
8635 if (v == NULL)
8636 goto onError;
8637 }
8638 sign = 0;
8639 fill = ' ';
8640 switch (c) {
8641
8642 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008643 pbuf = formatbuf;
8644 /* presume that buffer length is at least 1 */
8645 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 len = 1;
8647 break;
8648
8649 case 's':
8650 case 'r':
8651 if (PyUnicode_Check(v) && c == 's') {
8652 temp = v;
8653 Py_INCREF(temp);
8654 }
8655 else {
8656 PyObject *unicode;
8657 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00008658 temp = PyObject_Str(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 else
8660 temp = PyObject_Repr(v);
8661 if (temp == NULL)
8662 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008663 if (PyUnicode_Check(temp))
8664 /* nothing to do */;
8665 else if (PyString_Check(temp)) {
8666 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008667 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008669 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008671 Py_DECREF(temp);
8672 temp = unicode;
8673 if (temp == NULL)
8674 goto onError;
8675 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008676 else {
8677 Py_DECREF(temp);
8678 PyErr_SetString(PyExc_TypeError,
8679 "%s argument has non-string str()");
8680 goto onError;
8681 }
8682 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008683 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 len = PyUnicode_GET_SIZE(temp);
8685 if (prec >= 0 && len > prec)
8686 len = prec;
8687 break;
8688
8689 case 'i':
8690 case 'd':
8691 case 'u':
8692 case 'o':
8693 case 'x':
8694 case 'X':
8695 if (c == 'i')
8696 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008697 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008698 temp = formatlong(v, flags, prec, c);
8699 if (!temp)
8700 goto onError;
8701 pbuf = PyUnicode_AS_UNICODE(temp);
8702 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008703 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008705 else {
8706 pbuf = formatbuf;
8707 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8708 flags, prec, c, v);
8709 if (len < 0)
8710 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008711 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008712 }
8713 if (flags & F_ZERO)
8714 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715 break;
8716
8717 case 'e':
8718 case 'E':
8719 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008720 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 case 'g':
8722 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008723 if (c == 'F')
8724 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008725 pbuf = formatbuf;
8726 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8727 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 if (len < 0)
8729 goto onError;
8730 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008731 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 fill = '0';
8733 break;
8734
8735 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008736 pbuf = formatbuf;
8737 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 if (len < 0)
8739 goto onError;
8740 break;
8741
8742 default:
8743 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008744 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008745 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008746 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008747 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008748 (Py_ssize_t)(fmt - 1 -
8749 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 goto onError;
8751 }
8752 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008753 if (*pbuf == '-' || *pbuf == '+') {
8754 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 len--;
8756 }
8757 else if (flags & F_SIGN)
8758 sign = '+';
8759 else if (flags & F_BLANK)
8760 sign = ' ';
8761 else
8762 sign = 0;
8763 }
8764 if (width < len)
8765 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008766 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 reslen -= rescnt;
8768 rescnt = width + fmtcnt + 100;
8769 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008770 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008771 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008772 PyErr_NoMemory();
8773 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008774 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008775 if (_PyUnicode_Resize(&result, reslen) < 0) {
8776 Py_XDECREF(temp);
8777 goto onError;
8778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 res = PyUnicode_AS_UNICODE(result)
8780 + reslen - rescnt;
8781 }
8782 if (sign) {
8783 if (fill != ' ')
8784 *res++ = sign;
8785 rescnt--;
8786 if (width > len)
8787 width--;
8788 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008789 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008790 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008791 assert(pbuf[1] == c);
8792 if (fill != ' ') {
8793 *res++ = *pbuf++;
8794 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008795 }
Tim Petersfff53252001-04-12 18:38:48 +00008796 rescnt -= 2;
8797 width -= 2;
8798 if (width < 0)
8799 width = 0;
8800 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802 if (width > len && !(flags & F_LJUST)) {
8803 do {
8804 --rescnt;
8805 *res++ = fill;
8806 } while (--width > len);
8807 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008808 if (fill == ' ') {
8809 if (sign)
8810 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008811 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008812 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008813 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008814 *res++ = *pbuf++;
8815 *res++ = *pbuf++;
8816 }
8817 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008818 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819 res += len;
8820 rescnt -= len;
8821 while (--width >= len) {
8822 --rescnt;
8823 *res++ = ' ';
8824 }
8825 if (dict && (argidx < arglen) && c != '%') {
8826 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008827 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008828 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 goto onError;
8830 }
8831 Py_XDECREF(temp);
8832 } /* '%' */
8833 } /* until end */
8834 if (argidx < arglen && !dict) {
8835 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008836 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837 goto onError;
8838 }
8839
Thomas Woutersa96affe2006-03-12 00:29:36 +00008840 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8841 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842 if (args_owned) {
8843 Py_DECREF(args);
8844 }
8845 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 return (PyObject *)result;
8847
8848 onError:
8849 Py_XDECREF(result);
8850 Py_DECREF(uformat);
8851 if (args_owned) {
8852 Py_DECREF(args);
8853 }
8854 return NULL;
8855}
8856
Jeremy Hylton938ace62002-07-17 16:30:39 +00008857static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008858unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8859
Tim Peters6d6c1a32001-08-02 04:15:00 +00008860static PyObject *
8861unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8862{
8863 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008864 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008865 char *encoding = NULL;
8866 char *errors = NULL;
8867
Guido van Rossume023fe02001-08-30 03:12:59 +00008868 if (type != &PyUnicode_Type)
8869 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008870 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8871 kwlist, &x, &encoding, &errors))
8872 return NULL;
8873 if (x == NULL)
8874 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008875 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00008876 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008877 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008878 return PyUnicode_FromEncodedObject(x, encoding, errors);
8879}
8880
Guido van Rossume023fe02001-08-30 03:12:59 +00008881static PyObject *
8882unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8883{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008884 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008885 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008886
8887 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8888 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8889 if (tmp == NULL)
8890 return NULL;
8891 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008892 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008893 if (pnew == NULL) {
8894 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008895 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008896 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008897 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8898 if (pnew->str == NULL) {
8899 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008900 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008901 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008902 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008903 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008904 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8905 pnew->length = n;
8906 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008907 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008908 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008909}
8910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008911PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008912"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008913\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008914Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008915encoding defaults to the current default string encoding.\n\
8916errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008917
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008918static PyObject *unicode_iter(PyObject *seq);
8919
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008921 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008922 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 sizeof(PyUnicodeObject), /* tp_size */
8924 0, /* tp_itemsize */
8925 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008926 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008928 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008930 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008931 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008932 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008934 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 (hashfunc) unicode_hash, /* tp_hash*/
8936 0, /* tp_call*/
8937 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008938 PyObject_GenericGetAttr, /* tp_getattro */
8939 0, /* tp_setattro */
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00008940 0, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008941 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8942 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008943 unicode_doc, /* tp_doc */
8944 0, /* tp_traverse */
8945 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008946 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008947 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008948 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008949 0, /* tp_iternext */
8950 unicode_methods, /* tp_methods */
8951 0, /* tp_members */
8952 0, /* tp_getset */
Guido van Rossum3172c5d2007-10-16 18:12:55 +00008953 &PyBaseObject_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008954 0, /* tp_dict */
8955 0, /* tp_descr_get */
8956 0, /* tp_descr_set */
8957 0, /* tp_dictoffset */
8958 0, /* tp_init */
8959 0, /* tp_alloc */
8960 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008961 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962};
8963
8964/* Initialize the Unicode implementation */
8965
Thomas Wouters78890102000-07-22 19:25:51 +00008966void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008968 int i;
8969
Thomas Wouters477c8d52006-05-27 19:21:47 +00008970 /* XXX - move this array to unicodectype.c ? */
8971 Py_UNICODE linebreak[] = {
8972 0x000A, /* LINE FEED */
8973 0x000D, /* CARRIAGE RETURN */
8974 0x001C, /* FILE SEPARATOR */
8975 0x001D, /* GROUP SEPARATOR */
8976 0x001E, /* RECORD SEPARATOR */
8977 0x0085, /* NEXT LINE */
8978 0x2028, /* LINE SEPARATOR */
8979 0x2029, /* PARAGRAPH SEPARATOR */
8980 };
8981
Fred Drakee4315f52000-05-09 19:53:39 +00008982 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008983 unicode_freelist = NULL;
8984 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008986 if (!unicode_empty)
8987 return;
8988
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008989 for (i = 0; i < 256; i++)
8990 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008991 if (PyType_Ready(&PyUnicode_Type) < 0)
8992 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008993
8994 /* initialize the linebreak bloom filter */
8995 bloom_linebreak = make_bloom_mask(
8996 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8997 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008998
8999 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000}
9001
9002/* Finalize the Unicode implementation */
9003
9004void
Thomas Wouters78890102000-07-22 19:25:51 +00009005_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009007 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009008 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009010 Py_XDECREF(unicode_empty);
9011 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009012
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009013 for (i = 0; i < 256; i++) {
9014 if (unicode_latin1[i]) {
9015 Py_DECREF(unicode_latin1[i]);
9016 unicode_latin1[i] = NULL;
9017 }
9018 }
9019
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009020 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 PyUnicodeObject *v = u;
9022 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009023 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009024 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009025 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009026 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009028 unicode_freelist = NULL;
9029 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009031
Walter Dörwald16807132007-05-25 13:52:07 +00009032void
9033PyUnicode_InternInPlace(PyObject **p)
9034{
9035 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9036 PyObject *t;
9037 if (s == NULL || !PyUnicode_Check(s))
9038 Py_FatalError(
9039 "PyUnicode_InternInPlace: unicode strings only please!");
9040 /* If it's a subclass, we don't really know what putting
9041 it in the interned dict might do. */
9042 if (!PyUnicode_CheckExact(s))
9043 return;
9044 if (PyUnicode_CHECK_INTERNED(s))
9045 return;
9046 if (interned == NULL) {
9047 interned = PyDict_New();
9048 if (interned == NULL) {
9049 PyErr_Clear(); /* Don't leave an exception */
9050 return;
9051 }
9052 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009053 /* It might be that the GetItem call fails even
9054 though the key is present in the dictionary,
9055 namely when this happens during a stack overflow. */
9056 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009057 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009058 Py_END_ALLOW_RECURSION
9059
Walter Dörwald16807132007-05-25 13:52:07 +00009060 if (t) {
9061 Py_INCREF(t);
9062 Py_DECREF(*p);
9063 *p = t;
9064 return;
9065 }
9066
Martin v. Löwis5b222132007-06-10 09:51:05 +00009067 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009068 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9069 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009070 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009071 return;
9072 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009073 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009074 /* The two references in interned are not counted by refcnt.
9075 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009076 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009077 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9078}
9079
9080void
9081PyUnicode_InternImmortal(PyObject **p)
9082{
9083 PyUnicode_InternInPlace(p);
9084 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9085 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9086 Py_INCREF(*p);
9087 }
9088}
9089
9090PyObject *
9091PyUnicode_InternFromString(const char *cp)
9092{
9093 PyObject *s = PyUnicode_FromString(cp);
9094 if (s == NULL)
9095 return NULL;
9096 PyUnicode_InternInPlace(&s);
9097 return s;
9098}
9099
9100void _Py_ReleaseInternedUnicodeStrings(void)
9101{
9102 PyObject *keys;
9103 PyUnicodeObject *s;
9104 Py_ssize_t i, n;
9105 Py_ssize_t immortal_size = 0, mortal_size = 0;
9106
9107 if (interned == NULL || !PyDict_Check(interned))
9108 return;
9109 keys = PyDict_Keys(interned);
9110 if (keys == NULL || !PyList_Check(keys)) {
9111 PyErr_Clear();
9112 return;
9113 }
9114
9115 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9116 detector, interned unicode strings are not forcibly deallocated;
9117 rather, we give them their stolen references back, and then clear
9118 and DECREF the interned dict. */
9119
9120 n = PyList_GET_SIZE(keys);
9121 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9122 n);
9123 for (i = 0; i < n; i++) {
9124 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9125 switch (s->state) {
9126 case SSTATE_NOT_INTERNED:
9127 /* XXX Shouldn't happen */
9128 break;
9129 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009130 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009131 immortal_size += s->length;
9132 break;
9133 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009134 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009135 mortal_size += s->length;
9136 break;
9137 default:
9138 Py_FatalError("Inconsistent interned string state.");
9139 }
9140 s->state = SSTATE_NOT_INTERNED;
9141 }
9142 fprintf(stderr, "total size of all interned strings: "
9143 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9144 "mortal/immortal\n", mortal_size, immortal_size);
9145 Py_DECREF(keys);
9146 PyDict_Clear(interned);
9147 Py_DECREF(interned);
9148 interned = NULL;
9149}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009150
9151
9152/********************* Unicode Iterator **************************/
9153
9154typedef struct {
9155 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009156 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009157 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9158} unicodeiterobject;
9159
9160static void
9161unicodeiter_dealloc(unicodeiterobject *it)
9162{
9163 _PyObject_GC_UNTRACK(it);
9164 Py_XDECREF(it->it_seq);
9165 PyObject_GC_Del(it);
9166}
9167
9168static int
9169unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9170{
9171 Py_VISIT(it->it_seq);
9172 return 0;
9173}
9174
9175static PyObject *
9176unicodeiter_next(unicodeiterobject *it)
9177{
9178 PyUnicodeObject *seq;
9179 PyObject *item;
9180
9181 assert(it != NULL);
9182 seq = it->it_seq;
9183 if (seq == NULL)
9184 return NULL;
9185 assert(PyUnicode_Check(seq));
9186
9187 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009188 item = PyUnicode_FromUnicode(
9189 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009190 if (item != NULL)
9191 ++it->it_index;
9192 return item;
9193 }
9194
9195 Py_DECREF(seq);
9196 it->it_seq = NULL;
9197 return NULL;
9198}
9199
9200static PyObject *
9201unicodeiter_len(unicodeiterobject *it)
9202{
9203 Py_ssize_t len = 0;
9204 if (it->it_seq)
9205 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9206 return PyInt_FromSsize_t(len);
9207}
9208
9209PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9210
9211static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009212 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9213 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009214 {NULL, NULL} /* sentinel */
9215};
9216
9217PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009218 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009219 "unicodeiterator", /* tp_name */
9220 sizeof(unicodeiterobject), /* tp_basicsize */
9221 0, /* tp_itemsize */
9222 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009223 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009224 0, /* tp_print */
9225 0, /* tp_getattr */
9226 0, /* tp_setattr */
9227 0, /* tp_compare */
9228 0, /* tp_repr */
9229 0, /* tp_as_number */
9230 0, /* tp_as_sequence */
9231 0, /* tp_as_mapping */
9232 0, /* tp_hash */
9233 0, /* tp_call */
9234 0, /* tp_str */
9235 PyObject_GenericGetAttr, /* tp_getattro */
9236 0, /* tp_setattro */
9237 0, /* tp_as_buffer */
9238 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9239 0, /* tp_doc */
9240 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9241 0, /* tp_clear */
9242 0, /* tp_richcompare */
9243 0, /* tp_weaklistoffset */
9244 PyObject_SelfIter, /* tp_iter */
9245 (iternextfunc)unicodeiter_next, /* tp_iternext */
9246 unicodeiter_methods, /* tp_methods */
9247 0,
9248};
9249
9250static PyObject *
9251unicode_iter(PyObject *seq)
9252{
9253 unicodeiterobject *it;
9254
9255 if (!PyUnicode_Check(seq)) {
9256 PyErr_BadInternalCall();
9257 return NULL;
9258 }
9259 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9260 if (it == NULL)
9261 return NULL;
9262 it->it_index = 0;
9263 Py_INCREF(seq);
9264 it->it_seq = (PyUnicodeObject *)seq;
9265 _PyObject_GC_TRACK(it);
9266 return (PyObject *)it;
9267}
9268
Martin v. Löwis5b222132007-06-10 09:51:05 +00009269size_t
9270Py_UNICODE_strlen(const Py_UNICODE *u)
9271{
9272 int res = 0;
9273 while(*u++)
9274 res++;
9275 return res;
9276}
9277
9278Py_UNICODE*
9279Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9280{
9281 Py_UNICODE *u = s1;
9282 while ((*u++ = *s2++));
9283 return s1;
9284}
9285
9286Py_UNICODE*
9287Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9288{
9289 Py_UNICODE *u = s1;
9290 while ((*u++ = *s2++))
9291 if (n-- == 0)
9292 break;
9293 return s1;
9294}
9295
9296int
9297Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9298{
9299 while (*s1 && *s2 && *s1 == *s2)
9300 s1++, s2++;
9301 if (*s1 && *s2)
9302 return (*s1 < *s2) ? -1 : +1;
9303 if (*s1)
9304 return 1;
9305 if (*s2)
9306 return -1;
9307 return 0;
9308}
9309
9310Py_UNICODE*
9311Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9312{
9313 const Py_UNICODE *p;
9314 for (p = s; *p; p++)
9315 if (*p == c)
9316 return (Py_UNICODE*)p;
9317 return NULL;
9318}
9319
9320
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009321#ifdef __cplusplus
9322}
9323#endif
9324
9325
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009326/*
9327Local variables:
9328c-basic-offset: 4
9329indent-tabs-mode: nil
9330End:
9331*/