blob: 5ee3347b0e16df502871e6f160d87452a7a83a3b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the least
   significant 5 bits of each Unicode character as the bit index. */
138
/* the linebreak mask is set up by _PyUnicode_Init() below */
140
/* Type of a bloom bitmask: one bit per possible value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

/* Mask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is 1UL, not a plain int 1: bit index 31 would
   overflow a 32-bit signed int (undefined behaviour) and, once widened to
   unsigned long, smear into the upper bits.  `mask` is parenthesized so
   that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-test with the bloom mask, then the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* True if `chr` is a member of `set`: the bloom mask rejects most
   non-members cheaply before the exact linear search runs.  The whole
   expansion is parenthesized so the macro can be negated or combined with
   other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/

237
238static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000239PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240{
241 register PyUnicodeObject *unicode;
242
Thomas Wouters477c8d52006-05-27 19:21:47 +0000243 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 if (length == 0 && unicode_empty != NULL) {
245 Py_INCREF(unicode_empty);
246 return unicode_empty;
247 }
248
249 /* Unicode freelist & memory allocation */
250 if (unicode_freelist) {
251 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000252 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 /* Keep-Alive optimization: we only upsize the buffer,
256 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000257 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000258 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000259 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000260 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 }
262 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 }
266 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000269 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 if (unicode == NULL)
271 return NULL;
272 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
273 }
274
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000275 if (!unicode->str) {
276 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000277 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000278 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000280 * the caller fails before initializing str -- unicode_resize()
281 * reads str[0], and the Keep-Alive optimization can keep memory
282 * allocated for str alive across a call to unicode_dealloc(unicode).
283 * We don't want unicode_resize to read uninitialized memory in
284 * that case.
285 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000286 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000288 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000290 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000291 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000293
294 onError:
295 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000296 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298}
299
300static
Guido van Rossum9475a232001-10-05 20:51:39 +0000301void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302{
Walter Dörwald16807132007-05-25 13:52:07 +0000303 switch (PyUnicode_CHECK_INTERNED(unicode)) {
304 case SSTATE_NOT_INTERNED:
305 break;
306
307 case SSTATE_INTERNED_MORTAL:
308 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000309 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000310 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
311 Py_FatalError(
312 "deletion of interned unicode string failed");
313 break;
314
315 case SSTATE_INTERNED_IMMORTAL:
316 Py_FatalError("Immortal interned unicode string died.");
317
318 default:
319 Py_FatalError("Inconsistent interned unicode string state.");
320 }
321
Guido van Rossum604ddf82001-12-06 20:03:56 +0000322 if (PyUnicode_CheckExact(unicode) &&
323 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Keep-Alive optimization */
325 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000326 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 unicode->str = NULL;
328 unicode->length = 0;
329 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000330 if (unicode->defenc) {
331 Py_DECREF(unicode->defenc);
332 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000333 }
334 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 *(PyUnicodeObject **)unicode = unicode_freelist;
336 unicode_freelist = unicode;
337 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000340 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000341 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000342 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 }
344}
345
Martin v. Löwis18e16552006-02-15 17:27:45 +0000346int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000347{
348 register PyUnicodeObject *v;
349
350 /* Argument checks */
351 if (unicode == NULL) {
352 PyErr_BadInternalCall();
353 return -1;
354 }
355 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000356 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000357 PyErr_BadInternalCall();
358 return -1;
359 }
360
361 /* Resizing unicode_empty and single character objects is not
362 possible since these are being shared. We simply return a fresh
363 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000364 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000365 (v == unicode_empty || v->length == 1)) {
366 PyUnicodeObject *w = _PyUnicode_New(length);
367 if (w == NULL)
368 return -1;
369 Py_UNICODE_COPY(w->str, v->str,
370 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000371 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 *unicode = (PyObject *)w;
373 return 0;
374 }
375
376 /* Note that we don't have to modify *unicode for unshared Unicode
377 objects, since we can modify them in-place. */
378 return unicode_resize(v, length);
379}
380
/* Internal API for use in unicodeobject.c only !
   Calls PyUnicode_Resize() on the address of a PyUnicodeObject* variable,
   casting it to the PyObject** that function expects.  Both arguments
   are parenthesized so that arbitrary expressions can be passed. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000386 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387{
388 PyUnicodeObject *unicode;
389
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000390 /* If the Unicode data is known at construction time, we can apply
391 some optimizations which share commonly used objects. */
392 if (u != NULL) {
393
394 /* Optimization for empty strings */
395 if (size == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return (PyObject *)unicode_empty;
398 }
399
400 /* Single character Unicode objects in the Latin-1 range are
401 shared when using this constructor */
402 if (size == 1 && *u < 256) {
403 unicode = unicode_latin1[*u];
404 if (!unicode) {
405 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000406 if (!unicode)
407 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000408 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 unicode_latin1[*u] = unicode;
410 }
411 Py_INCREF(unicode);
412 return (PyObject *)unicode;
413 }
414 }
Tim Petersced69f82003-09-16 20:30:58 +0000415
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 unicode = _PyUnicode_New(size);
417 if (!unicode)
418 return NULL;
419
420 /* Copy the Unicode data into the new object */
421 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423
424 return (PyObject *)unicode;
425}
426
Walter Dörwaldd2034312007-05-18 16:29:38 +0000427PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000428{
429 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000430 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000431 some optimizations which share commonly used objects.
432 Also, this means the input must be UTF-8, so fall back to the
433 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000434 if (u != NULL) {
435
436 /* Optimization for empty strings */
437 if (size == 0 && unicode_empty != NULL) {
438 Py_INCREF(unicode_empty);
439 return (PyObject *)unicode_empty;
440 }
441
Martin v. Löwis9c121062007-08-05 20:26:11 +0000442 /* Single characters are shared when using this constructor.
443 Restrict to ASCII, since the input must be UTF-8. */
444 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000445 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000446 if (!unicode) {
447 unicode = _PyUnicode_New(1);
448 if (!unicode)
449 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000450 unicode->str[0] = Py_CHARMASK(*u);
451 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000452 }
453 Py_INCREF(unicode);
454 return (PyObject *)unicode;
455 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000456
457 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000458 }
459
Walter Dörwald55507312007-05-18 13:12:10 +0000460 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000461 if (!unicode)
462 return NULL;
463
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000464 return (PyObject *)unicode;
465}
466
Walter Dörwaldd2034312007-05-18 16:29:38 +0000467PyObject *PyUnicode_FromString(const char *u)
468{
469 size_t size = strlen(u);
470 if (size > PY_SSIZE_T_MAX) {
471 PyErr_SetString(PyExc_OverflowError, "input too long");
472 return NULL;
473 }
474
475 return PyUnicode_FromStringAndSize(u, size);
476}
477
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478#ifdef HAVE_WCHAR_H
479
480PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000481 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482{
483 PyUnicodeObject *unicode;
484
485 if (w == NULL) {
486 PyErr_BadInternalCall();
487 return NULL;
488 }
489
490 unicode = _PyUnicode_New(size);
491 if (!unicode)
492 return NULL;
493
494 /* Copy the wchar_t data into the new object */
495#ifdef HAVE_USABLE_WCHAR_T
496 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000497#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 {
499 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000500 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000502 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 *u++ = *w++;
504 }
505#endif
506
507 return (PyObject *)unicode;
508}
509
Walter Dörwald346737f2007-05-31 10:44:43 +0000510static void
511makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
512{
513 *fmt++ = '%';
514 if (width) {
515 if (zeropad)
516 *fmt++ = '0';
517 fmt += sprintf(fmt, "%d", width);
518 }
519 if (precision)
520 fmt += sprintf(fmt, ".%d", precision);
521 if (longflag)
522 *fmt++ = 'l';
523 else if (size_tflag) {
524 char *f = PY_FORMAT_SIZE_T;
525 while (*f)
526 *fmt++ = *f++;
527 }
528 *fmt++ = c;
529 *fmt = '\0';
530}
531
/* Append the NUL-terminated byte string `string` to the output, one
   Py_UNICODE per byte.  Relies on two variables of the enclosing function:
   `copy` (const char *) as the read cursor and `s` (Py_UNICODE *) as the
   write cursor, which is advanced past the copied characters.

   Each byte is converted through unsigned char so that bytes >= 0x80 are
   not sign-extended on platforms where plain char is signed.  The brace
   form (rather than do/while) is kept so every existing use of the macro
   continues to compile unchanged. */
#define appendstring(string) \
    {for (copy = (string); *copy;) *s++ = (unsigned char)*copy++;}
533
534PyObject *
535PyUnicode_FromFormatV(const char *format, va_list vargs)
536{
537 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000538 Py_ssize_t callcount = 0;
539 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000540 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000541 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000542 int width = 0;
543 int precision = 0;
544 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000545 const char* f;
546 Py_UNICODE *s;
547 PyObject *string;
548 /* used by sprintf */
549 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000550 /* use abuffer instead of buffer, if we need more space
551 * (which can happen if there's a format specifier with width). */
552 char *abuffer = NULL;
553 char *realbuffer;
554 Py_ssize_t abuffersize = 0;
555 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000556 const char *copy;
557
558#ifdef VA_LIST_IS_ARRAY
559 Py_MEMCPY(count, vargs, sizeof(va_list));
560#else
561#ifdef __va_copy
562 __va_copy(count, vargs);
563#else
564 count = vargs;
565#endif
566#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000567 /* step 1: count the number of %S/%R format specifications
568 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
569 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000570 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000571 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000572 ++callcount;
573 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 /* step 2: allocate memory for the results of
575 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000576 if (callcount) {
577 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
578 if (!callresults) {
579 PyErr_NoMemory();
580 return NULL;
581 }
582 callresult = callresults;
583 }
584 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000585 for (f = format; *f; f++) {
586 if (*f == '%') {
587 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000588 width = 0;
589 while (isdigit(Py_CHARMASK(*f)))
590 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000591 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
592 ;
593
594 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
595 * they don't affect the amount of space we reserve.
596 */
597 if ((*f == 'l' || *f == 'z') &&
598 (f[1] == 'd' || f[1] == 'u'))
599 ++f;
600
601 switch (*f) {
602 case 'c':
603 (void)va_arg(count, int);
604 /* fall through... */
605 case '%':
606 n++;
607 break;
608 case 'd': case 'u': case 'i': case 'x':
609 (void) va_arg(count, int);
610 /* 20 bytes is enough to hold a 64-bit
611 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000612 This isn't enough for octal.
613 If a width is specified we need more
614 (which we allocate later). */
615 if (width < 20)
616 width = 20;
617 n += width;
618 if (abuffersize < width)
619 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000620 break;
621 case 's':
622 n += strlen(va_arg(count, char*));
623 break;
624 case 'U':
625 {
626 PyObject *obj = va_arg(count, PyObject *);
627 assert(obj && PyUnicode_Check(obj));
628 n += PyUnicode_GET_SIZE(obj);
629 break;
630 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000631 case 'V':
632 {
633 PyObject *obj = va_arg(count, PyObject *);
634 const char *str = va_arg(count, const char *);
635 assert(obj || str);
636 assert(!obj || PyUnicode_Check(obj));
637 if (obj)
638 n += PyUnicode_GET_SIZE(obj);
639 else
640 n += strlen(str);
641 break;
642 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000643 case 'S':
644 {
645 PyObject *obj = va_arg(count, PyObject *);
646 PyObject *str;
647 assert(obj);
648 str = PyObject_Unicode(obj);
649 if (!str)
650 goto fail;
651 n += PyUnicode_GET_SIZE(str);
652 /* Remember the str and switch to the next slot */
653 *callresult++ = str;
654 break;
655 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000656 case 'R':
657 {
658 PyObject *obj = va_arg(count, PyObject *);
659 PyObject *repr;
660 assert(obj);
661 repr = PyObject_Repr(obj);
662 if (!repr)
663 goto fail;
664 n += PyUnicode_GET_SIZE(repr);
665 /* Remember the repr and switch to the next slot */
666 *callresult++ = repr;
667 break;
668 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000669 case 'p':
670 (void) va_arg(count, int);
671 /* maximum 64-bit pointer representation:
672 * 0xffffffffffffffff
673 * so 19 characters is enough.
674 * XXX I count 18 -- what's the extra for?
675 */
676 n += 19;
677 break;
678 default:
679 /* if we stumble upon an unknown
680 formatting code, copy the rest of
681 the format string to the output
682 string. (we cannot just skip the
683 code, since there's no way to know
684 what's in the argument list) */
685 n += strlen(p);
686 goto expand;
687 }
688 } else
689 n++;
690 }
691 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000692 if (abuffersize > 20) {
693 abuffer = PyMem_Malloc(abuffersize);
694 if (!abuffer) {
695 PyErr_NoMemory();
696 goto fail;
697 }
698 realbuffer = abuffer;
699 }
700 else
701 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000702 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000703 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000704 we don't have to resize the string.
705 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 string = PyUnicode_FromUnicode(NULL, n);
707 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000708 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709
710 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000711 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 for (f = format; *f; f++) {
714 if (*f == '%') {
715 const char* p = f++;
716 int longflag = 0;
717 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000718 zeropad = (*f == '0');
719 /* parse the width.precision part */
720 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000721 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000722 width = (width*10) + *f++ - '0';
723 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 if (*f == '.') {
725 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000727 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 /* handle the long flag, but only for %ld and %lu.
730 others can be added when necessary. */
731 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
732 longflag = 1;
733 ++f;
734 }
735 /* handle the size_t flag. */
736 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
737 size_tflag = 1;
738 ++f;
739 }
740
741 switch (*f) {
742 case 'c':
743 *s++ = va_arg(vargs, int);
744 break;
745 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000746 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000748 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000750 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000751 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000752 sprintf(realbuffer, fmt, va_arg(vargs, int));
753 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 break;
755 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000756 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000758 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000760 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000762 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
763 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 break;
765 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000766 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
767 sprintf(realbuffer, fmt, va_arg(vargs, int));
768 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000769 break;
770 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000771 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
772 sprintf(realbuffer, fmt, va_arg(vargs, int));
773 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000774 break;
775 case 's':
776 p = va_arg(vargs, char*);
777 appendstring(p);
778 break;
779 case 'U':
780 {
781 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000782 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
783 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
784 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000785 break;
786 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000787 case 'V':
788 {
789 PyObject *obj = va_arg(vargs, PyObject *);
790 const char *str = va_arg(vargs, const char *);
791 if (obj) {
792 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
793 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
794 s += size;
795 } else {
796 appendstring(str);
797 }
798 break;
799 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000800 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000801 case 'R':
802 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000803 Py_UNICODE *ucopy;
804 Py_ssize_t usize;
805 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000806 /* unused, since we already have the result */
807 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000808 ucopy = PyUnicode_AS_UNICODE(*callresult);
809 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000810 for (upos = 0; upos<usize;)
811 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000812 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000814 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000815 ++callresult;
816 break;
817 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000818 case 'p':
819 sprintf(buffer, "%p", va_arg(vargs, void*));
820 /* %p is ill-defined: ensure leading 0x. */
821 if (buffer[1] == 'X')
822 buffer[1] = 'x';
823 else if (buffer[1] != 'x') {
824 memmove(buffer+2, buffer, strlen(buffer)+1);
825 buffer[0] = '0';
826 buffer[1] = 'x';
827 }
828 appendstring(buffer);
829 break;
830 case '%':
831 *s++ = '%';
832 break;
833 default:
834 appendstring(p);
835 goto end;
836 }
837 } else
838 *s++ = *f;
839 }
840
841 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000842 if (callresults)
843 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000844 if (abuffer)
845 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000846 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
847 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000848 fail:
849 if (callresults) {
850 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000851 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000852 Py_DECREF(*callresult2);
853 ++callresult2;
854 }
855 PyMem_Free(callresults);
856 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 if (abuffer)
858 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000859 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000860}
861
862#undef appendstring
863
864PyObject *
865PyUnicode_FromFormat(const char *format, ...)
866{
867 PyObject* ret;
868 va_list vargs;
869
870#ifdef HAVE_STDARG_PROTOTYPES
871 va_start(vargs, format);
872#else
873 va_start(vargs);
874#endif
875 ret = PyUnicode_FromFormatV(format, vargs);
876 va_end(vargs);
877 return ret;
878}
879
Martin v. Löwis18e16552006-02-15 17:27:45 +0000880Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
881 wchar_t *w,
882 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883{
884 if (unicode == NULL) {
885 PyErr_BadInternalCall();
886 return -1;
887 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000888
889 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891 size = PyUnicode_GET_SIZE(unicode) + 1;
892
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893#ifdef HAVE_USABLE_WCHAR_T
894 memcpy(w, unicode->str, size * sizeof(wchar_t));
895#else
896 {
897 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000898 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000899 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000900 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 *w++ = *u++;
902 }
903#endif
904
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000905 if (size > PyUnicode_GET_SIZE(unicode))
906 return PyUnicode_GET_SIZE(unicode);
907 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000908 return size;
909}
910
911#endif
912
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000913PyObject *PyUnicode_FromOrdinal(int ordinal)
914{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000915 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000917 if (ordinal < 0 || ordinal > 0x10ffff) {
918 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000919 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000920 return NULL;
921 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000922
923#ifndef Py_UNICODE_WIDE
924 if (ordinal > 0xffff) {
925 ordinal -= 0x10000;
926 s[0] = 0xD800 | (ordinal >> 10);
927 s[1] = 0xDC00 | (ordinal & 0x3FF);
928 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000929 }
930#endif
931
Hye-Shik Chang40574832004-04-06 07:24:51 +0000932 s[0] = (Py_UNICODE)ordinal;
933 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000934}
935
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936PyObject *PyUnicode_FromObject(register PyObject *obj)
937{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000938 /* XXX Perhaps we should make this API an alias of
939 PyObject_Unicode() instead ?! */
940 if (PyUnicode_CheckExact(obj)) {
941 Py_INCREF(obj);
942 return obj;
943 }
944 if (PyUnicode_Check(obj)) {
945 /* For a Unicode subtype that's not a Unicode object,
946 return a true Unicode object with the same data. */
947 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
948 PyUnicode_GET_SIZE(obj));
949 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000950 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
951}
952
953PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
954 const char *encoding,
955 const char *errors)
956{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000957 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000958 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000959 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000960
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961 if (obj == NULL) {
962 PyErr_BadInternalCall();
963 return NULL;
964 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000965
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000966#if 0
967 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000968 that no encodings is given and then redirect to
969 PyObject_Unicode() which then applies the additional logic for
970 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000971
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000972 NOTE: This API should really only be used for object which
973 represent *encoded* Unicode !
974
975 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000976 if (PyUnicode_Check(obj)) {
977 if (encoding) {
978 PyErr_SetString(PyExc_TypeError,
979 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000980 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000981 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000982 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000983 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984#else
985 if (PyUnicode_Check(obj)) {
986 PyErr_SetString(PyExc_TypeError,
987 "decoding Unicode is not supported");
988 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000989 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000990#endif
991
992 /* Coerce object */
993 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000994 s = PyString_AS_STRING(obj);
995 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000996 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000997 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
998 /* Overwrite the error message with something more useful in
999 case of a TypeError. */
1000 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001001 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001002 "coercing to Unicode: need string or buffer, "
1003 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001004 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001005 goto onError;
1006 }
Tim Petersced69f82003-09-16 20:30:58 +00001007
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001008 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009 if (len == 0) {
1010 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 }
Tim Petersced69f82003-09-16 20:30:58 +00001013 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001014 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001015
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001016 return v;
1017
1018 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020}
1021
1022PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001023 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 const char *encoding,
1025 const char *errors)
1026{
1027 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001028
1029 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001030 encoding = PyUnicode_GetDefaultEncoding();
1031
1032 /* Shortcuts for common default encodings */
1033 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001035 else if (strcmp(encoding, "latin-1") == 0)
1036 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001037#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1038 else if (strcmp(encoding, "mbcs") == 0)
1039 return PyUnicode_DecodeMBCS(s, size, errors);
1040#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001041 else if (strcmp(encoding, "ascii") == 0)
1042 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 /* Decode via the codec registry */
1045 buffer = PyBuffer_FromMemory((void *)s, size);
1046 if (buffer == NULL)
1047 goto onError;
1048 unicode = PyCodec_Decode(buffer, encoding, errors);
1049 if (unicode == NULL)
1050 goto onError;
1051 if (!PyUnicode_Check(unicode)) {
1052 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001053 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001054 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 Py_DECREF(unicode);
1056 goto onError;
1057 }
1058 Py_DECREF(buffer);
1059 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001060
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 onError:
1062 Py_XDECREF(buffer);
1063 return NULL;
1064}
1065
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001066PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1067 const char *encoding,
1068 const char *errors)
1069{
1070 PyObject *v;
1071
1072 if (!PyUnicode_Check(unicode)) {
1073 PyErr_BadArgument();
1074 goto onError;
1075 }
1076
1077 if (encoding == NULL)
1078 encoding = PyUnicode_GetDefaultEncoding();
1079
1080 /* Decode via the codec registry */
1081 v = PyCodec_Decode(unicode, encoding, errors);
1082 if (v == NULL)
1083 goto onError;
1084 return v;
1085
1086 onError:
1087 return NULL;
1088}
1089
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 const char *encoding,
1093 const char *errors)
1094{
1095 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001096
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 unicode = PyUnicode_FromUnicode(s, size);
1098 if (unicode == NULL)
1099 return NULL;
1100 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1101 Py_DECREF(unicode);
1102 return v;
1103}
1104
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001105PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1106 const char *encoding,
1107 const char *errors)
1108{
1109 PyObject *v;
1110
1111 if (!PyUnicode_Check(unicode)) {
1112 PyErr_BadArgument();
1113 goto onError;
1114 }
1115
1116 if (encoding == NULL)
1117 encoding = PyUnicode_GetDefaultEncoding();
1118
1119 /* Encode via the codec registry */
1120 v = PyCodec_Encode(unicode, encoding, errors);
1121 if (v == NULL)
1122 goto onError;
1123 return v;
1124
1125 onError:
1126 return NULL;
1127}
1128
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1130 const char *encoding,
1131 const char *errors)
1132{
1133 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135 if (!PyUnicode_Check(unicode)) {
1136 PyErr_BadArgument();
1137 goto onError;
1138 }
Fred Drakee4315f52000-05-09 19:53:39 +00001139
Tim Petersced69f82003-09-16 20:30:58 +00001140 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001141 encoding = PyUnicode_GetDefaultEncoding();
1142
1143 /* Shortcuts for common default encodings */
1144 if (errors == NULL) {
1145 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001146 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001147 else if (strcmp(encoding, "latin-1") == 0)
1148 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001149#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1150 else if (strcmp(encoding, "mbcs") == 0)
1151 return PyUnicode_AsMBCSString(unicode);
1152#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001153 else if (strcmp(encoding, "ascii") == 0)
1154 return PyUnicode_AsASCIIString(unicode);
1155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 /* Encode via the codec registry */
1158 v = PyCodec_Encode(unicode, encoding, errors);
1159 if (v == NULL)
1160 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001161 if (!PyBytes_Check(v)) {
1162 if (PyString_Check(v)) {
1163 /* Old codec, turn it into bytes */
1164 PyObject *b = PyBytes_FromObject(v);
1165 Py_DECREF(v);
1166 return b;
1167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001169 "encoder did not return a bytes object "
1170 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1171 v->ob_type->tp_name,
1172 encoding ? encoding : "NULL",
1173 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 Py_DECREF(v);
1175 goto onError;
1176 }
1177 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001178
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 onError:
1180 return NULL;
1181}
1182
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001183PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1184 const char *errors)
1185{
1186 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001187 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001188 if (v)
1189 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001190 if (errors != NULL)
1191 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum06610092007-08-16 21:02:22 +00001192 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1193 PyUnicode_GET_SIZE(unicode),
1194 NULL);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001195 if (!b)
1196 return NULL;
1197 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1198 PyBytes_Size(b));
1199 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001200 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001201 return v;
1202}
1203
Martin v. Löwis5b222132007-06-10 09:51:05 +00001204char*
1205PyUnicode_AsString(PyObject *unicode)
1206{
1207 assert(PyUnicode_Check(unicode));
1208 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1209 if (!unicode)
1210 return NULL;
1211 return PyString_AsString(unicode);
1212}
1213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1215{
1216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_BadArgument();
1218 goto onError;
1219 }
1220 return PyUnicode_AS_UNICODE(unicode);
1221
1222 onError:
1223 return NULL;
1224}
1225
Martin v. Löwis18e16552006-02-15 17:27:45 +00001226Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227{
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232 return PyUnicode_GET_SIZE(unicode);
1233
1234 onError:
1235 return -1;
1236}
1237
Thomas Wouters78890102000-07-22 19:25:51 +00001238const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001239{
1240 return unicode_default_encoding;
1241}
1242
1243int PyUnicode_SetDefaultEncoding(const char *encoding)
1244{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001245 if (strcmp(encoding, unicode_default_encoding) != 0) {
1246 PyErr_Format(PyExc_ValueError,
1247 "Can only set default encoding to %s",
1248 unicode_default_encoding);
1249 return -1;
1250 }
Fred Drakee4315f52000-05-09 19:53:39 +00001251 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001252}
1253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001254/* error handling callback helper:
1255 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001256 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 and adjust various state variables.
1258 return 0 on success, -1 on error
1259*/
1260
1261static
1262int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1263 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001264 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001265 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001268
1269 PyObject *restuple = NULL;
1270 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001271 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001272 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001273 Py_ssize_t requiredsize;
1274 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001276 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001277 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 int res = -1;
1279
1280 if (*errorHandler == NULL) {
1281 *errorHandler = PyCodec_LookupError(errors);
1282 if (*errorHandler == NULL)
1283 goto onError;
1284 }
1285
1286 if (*exceptionObject == NULL) {
1287 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001288 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001289 if (*exceptionObject == NULL)
1290 goto onError;
1291 }
1292 else {
1293 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1294 goto onError;
1295 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1296 goto onError;
1297 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1298 goto onError;
1299 }
1300
1301 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1302 if (restuple == NULL)
1303 goto onError;
1304 if (!PyTuple_Check(restuple)) {
1305 PyErr_Format(PyExc_TypeError, &argparse[4]);
1306 goto onError;
1307 }
1308 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1309 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001310
1311 /* Copy back the bytes variables, which might have been modified by the
1312 callback */
1313 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1314 if (!inputobj)
1315 goto onError;
1316 if (!PyBytes_Check(inputobj)) {
1317 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1318 }
1319 *input = PyBytes_AS_STRING(inputobj);
1320 insize = PyBytes_GET_SIZE(inputobj);
1321 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001322 /* we can DECREF safely, as the exception has another reference,
1323 so the object won't go away. */
1324 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001327 newpos = insize+newpos;
1328 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001329 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001330 goto onError;
1331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332
1333 /* need more space? (at least enough for what we
1334 have+the replacement+the rest of the string (starting
1335 at the new input position), so we won't have to check space
1336 when there are no errors in the rest of the string) */
1337 repptr = PyUnicode_AS_UNICODE(repunicode);
1338 repsize = PyUnicode_GET_SIZE(repunicode);
1339 requiredsize = *outpos + repsize + insize-newpos;
1340 if (requiredsize > outsize) {
1341 if (requiredsize<2*outsize)
1342 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001343 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 goto onError;
1345 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1346 }
1347 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001348 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001349 Py_UNICODE_COPY(*outptr, repptr, repsize);
1350 *outptr += repsize;
1351 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001352
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001353 /* we made it! */
1354 res = 0;
1355
1356 onError:
1357 Py_XDECREF(restuple);
1358 return res;
1359}
1360
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001361/* --- UTF-7 Codec -------------------------------------------------------- */
1362
1363/* see RFC2152 for details */
1364
/* Classification of the 128 seven-bit character codes for UTF-7,
   indexed by character code; it indicates whether a character is
   special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1383
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001384/* Note: The comparison (c) <= 0 is a trick to work-around gcc
1385 warnings about the comparison always being false; since
1386 utf7_special[0] is 1, we can safely make that one comparison
1387 true */
1388
/* SPECIAL(c, encodeO, encodeWS): true when c may not be written
   directly: c is outside 1..127, or its utf7_special class is 1, or
   class 2 while encodeWS is set, or class 3 while encodeO is set.
   Evaluates c several times, so c must be free of side effects. */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001389#define SPECIAL(c, encodeO, encodeWS) \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001390 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001391 (encodeWS && (utf7_special[(c)] == 2)) || \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001392 (encodeO && (utf7_special[(c)] == 3)))
1393
/* B64(n): character of the base64 alphabet for the low 6 bits of n. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001394#define B64(n) \
1395 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is alphanumeric, '+' or '/'.
   NOTE(review): c is handed to isalnum() unchanged; callers presumably
   keep it in the range isalnum() accepts -- confirm. */
1396#define B64CHAR(c) \
1397 (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): 6-bit value of base64 character c: '+' -> 62, '/' -> 63,
   'a'.. -> c-71 (26 and up), 'A'.. -> c-65 (0 and up), otherwise c+4
   (so '0' -> 52).  No validation: c is expected to satisfy B64CHAR. */
1398#define UB64(c) \
1399 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
1400 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001401
/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit
   the topmost 6 pending bits as one base64 character through out and
   reduce bits by 6.  Expands to a bare while statement (not wrapped in
   do/while(0)) and modifies out and bits in place. */
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001402#define ENCODE(out, ch, bits) \
1403 while (bits >= 6) { \
1404 *out++ = B64(ch >> (bits-6)); \
1405 bits -= 6; \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001406 }
1407
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, take the topmost 16 pending bits as one code unit and reduce
   bits by 16.  If surrogate is set the unit is dropped and the flag
   cleared; if the unit lies in 0xDC00..0xDFFF the flag is set, errmsg
   is assigned and control jumps to utf7Error; otherwise the unit is
   stored through out.  The expansion site must therefore provide a
   variable `errmsg` and a label `utf7Error`.
   NOTE(review): the tested range 0xDC00..0xDFFF is the low-surrogate
   range although the inner comment speaks of the high surrogate --
   confirm the intended range. */
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001408#define DECODE(out, ch, bits, surrogate) \
1409 while (bits >= 16) { \
1410 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
1411 bits -= 16; \
1412 if (surrogate) { \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001413 /* We have already generated an error for the high surrogate \
1414 so let's not bother seeing if the low surrogate is correct or not */ \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001415 surrogate = 0; \
1416 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001417 /* This is a surrogate pair. Unfortunately we can't represent \
Marc-André Lemburg5c4a9d62005-10-19 22:39:02 +00001418 it in a 16-bit character */ \
1419 surrogate = 1; \
1420 errmsg = "code pairs are not supported"; \
1421 goto utf7Error; \
1422 } else { \
1423 *out++ = outCh; \
1424 } \
Marc-André Lemburge115ec82005-10-19 22:33:31 +00001425 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001426
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001427PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001428 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001429 const char *errors)
1430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 Py_ssize_t startinpos;
1433 Py_ssize_t endinpos;
1434 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001435 const char *e;
1436 PyUnicodeObject *unicode;
1437 Py_UNICODE *p;
1438 const char *errmsg = "";
1439 int inShift = 0;
1440 unsigned int bitsleft = 0;
1441 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 int surrogate = 0;
1443 PyObject *errorHandler = NULL;
1444 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001445
1446 unicode = _PyUnicode_New(size);
1447 if (!unicode)
1448 return NULL;
1449 if (size == 0)
1450 return (PyObject *)unicode;
1451
1452 p = unicode->str;
1453 e = s + size;
1454
1455 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 Py_UNICODE ch;
1457 restart:
1458 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001459
1460 if (inShift) {
1461 if ((ch == '-') || !B64CHAR(ch)) {
1462 inShift = 0;
1463 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001464
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001465 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1466 if (bitsleft >= 6) {
1467 /* The shift sequence has a partial character in it. If
1468 bitsleft < 6 then we could just classify it as padding
1469 but that is not the case here */
1470
1471 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001472 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001473 }
1474 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001475 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476 here so indicate the potential of a misencoded character. */
1477
1478 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1479 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1480 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001481 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482 }
1483
1484 if (ch == '-') {
1485 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001486 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001487 inShift = 1;
1488 }
1489 } else if (SPECIAL(ch,0,0)) {
1490 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001491 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492 } else {
1493 *p++ = ch;
1494 }
1495 } else {
1496 charsleft = (charsleft << 6) | UB64(ch);
1497 bitsleft += 6;
1498 s++;
1499 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1500 }
1501 }
1502 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001504 s++;
1505 if (s < e && *s == '-') {
1506 s++;
1507 *p++ = '+';
1508 } else
1509 {
1510 inShift = 1;
1511 bitsleft = 0;
1512 }
1513 }
1514 else if (SPECIAL(ch,0,0)) {
1515 errmsg = "unexpected special character";
1516 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001517 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001518 }
1519 else {
1520 *p++ = ch;
1521 s++;
1522 }
1523 continue;
1524 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 outpos = p-PyUnicode_AS_UNICODE(unicode);
1526 endinpos = s-starts;
1527 if (unicode_decode_call_errorhandler(
1528 errors, &errorHandler,
1529 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001530 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001531 (PyObject **)&unicode, &outpos, &p))
1532 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001533 }
1534
1535 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001536 outpos = p-PyUnicode_AS_UNICODE(unicode);
1537 endinpos = size;
1538 if (unicode_decode_call_errorhandler(
1539 errors, &errorHandler,
1540 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001541 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 if (s < e)
1545 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546 }
1547
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001548 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 goto onError;
1550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 Py_XDECREF(errorHandler);
1552 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 return (PyObject *)unicode;
1554
1555onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 Py_XDECREF(errorHandler);
1557 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 Py_DECREF(unicode);
1559 return NULL;
1560}
1561
1562
1563PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001564 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001565 int encodeSetO,
1566 int encodeWhiteSpace,
1567 const char *errors)
1568{
1569 PyObject *v;
1570 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001571 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001573 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001574 unsigned int bitsleft = 0;
1575 unsigned long charsleft = 0;
1576 char * out;
1577 char * start;
1578
1579 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001580 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581
Walter Dörwald51ab4142007-05-05 14:43:36 +00001582 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 if (v == NULL)
1584 return NULL;
1585
Walter Dörwald51ab4142007-05-05 14:43:36 +00001586 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 for (;i < size; ++i) {
1588 Py_UNICODE ch = s[i];
1589
1590 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001591 if (ch == '+') {
1592 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 *out++ = '-';
1594 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1595 charsleft = ch;
1596 bitsleft = 16;
1597 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001598 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001600 } else {
1601 *out++ = (char) ch;
1602 }
1603 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1605 *out++ = B64(charsleft << (6-bitsleft));
1606 charsleft = 0;
1607 bitsleft = 0;
1608 /* Characters not in the BASE64 set implicitly unshift the sequence
1609 so no '-' is required, except if the character is itself a '-' */
1610 if (B64CHAR(ch) || ch == '-') {
1611 *out++ = '-';
1612 }
1613 inShift = 0;
1614 *out++ = (char) ch;
1615 } else {
1616 bitsleft += 16;
1617 charsleft = (charsleft << 16) | ch;
1618 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1619
1620 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001621 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001622 or '-' then the shift sequence will be terminated implicitly and we
1623 don't have to insert a '-'. */
1624
1625 if (bitsleft == 0) {
1626 if (i + 1 < size) {
1627 Py_UNICODE ch2 = s[i+1];
1628
1629 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001630
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631 } else if (B64CHAR(ch2) || ch2 == '-') {
1632 *out++ = '-';
1633 inShift = 0;
1634 } else {
1635 inShift = 0;
1636 }
1637
1638 }
1639 else {
1640 *out++ = '-';
1641 inShift = 0;
1642 }
1643 }
Tim Petersced69f82003-09-16 20:30:58 +00001644 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001645 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001646 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001647 if (bitsleft) {
1648 *out++= B64(charsleft << (6-bitsleft) );
1649 *out++ = '-';
1650 }
1651
Walter Dörwald51ab4142007-05-05 14:43:36 +00001652 if (PyBytes_Resize(v, out - start)) {
1653 Py_DECREF(v);
1654 return NULL;
1655 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656 return v;
1657}
1658
1659#undef SPECIAL
1660#undef B64
1661#undef B64CHAR
1662#undef UB64
1663#undef ENCODE
1664#undef DECODE
1665
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666/* --- UTF-8 Codec -------------------------------------------------------- */
1667
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero marks a byte that cannot start a sequence.  See RFC 2279 for
   details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1689
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 const char *errors)
1693{
Walter Dörwald69652032004-09-07 20:24:22 +00001694 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1695}
1696
1697PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001698 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001699 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001700 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001702 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001704 Py_ssize_t startinpos;
1705 Py_ssize_t endinpos;
1706 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 const char *e;
1708 PyUnicodeObject *unicode;
1709 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001710 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001711 PyObject *errorHandler = NULL;
1712 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713
1714 /* Note: size will always be longer than the resulting Unicode
1715 character count */
1716 unicode = _PyUnicode_New(size);
1717 if (!unicode)
1718 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001719 if (size == 0) {
1720 if (consumed)
1721 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
1725 /* Unpack UTF-8 encoded data */
1726 p = unicode->str;
1727 e = s + size;
1728
1729 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001730 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
1732 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001733 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 s++;
1735 continue;
1736 }
1737
1738 n = utf8_code_length[ch];
1739
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001740 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001741 if (consumed)
1742 break;
1743 else {
1744 errmsg = "unexpected end of data";
1745 startinpos = s-starts;
1746 endinpos = size;
1747 goto utf8Error;
1748 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750
1751 switch (n) {
1752
1753 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001755 startinpos = s-starts;
1756 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758
1759 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001760 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 startinpos = s-starts;
1762 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001763 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764
1765 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001766 if ((s[1] & 0xc0) != 0x80) {
1767 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 startinpos = s-starts;
1769 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001770 goto utf8Error;
1771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001773 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001774 startinpos = s-starts;
1775 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001776 errmsg = "illegal encoding";
1777 goto utf8Error;
1778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781 break;
1782
1783 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001784 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001785 (s[2] & 0xc0) != 0x80) {
1786 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001787 startinpos = s-starts;
1788 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001789 goto utf8Error;
1790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001792 if (ch < 0x0800) {
1793 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001794 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001795
1796 XXX For wide builds (UCS-4) we should probably try
1797 to recombine the surrogates into a single code
1798 unit.
1799 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001800 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 startinpos = s-starts;
1802 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001803 goto utf8Error;
1804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001806 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001807 break;
1808
1809 case 4:
1810 if ((s[1] & 0xc0) != 0x80 ||
1811 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001812 (s[3] & 0xc0) != 0x80) {
1813 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001814 startinpos = s-starts;
1815 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001816 goto utf8Error;
1817 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001818 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1819 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1820 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001821 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001822 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001823 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001824 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001825 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001826 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 startinpos = s-starts;
1828 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001829 goto utf8Error;
1830 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001831#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001832 *p++ = (Py_UNICODE)ch;
1833#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001834 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001835
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001836 /* translate from 10000..10FFFF to 0..FFFF */
1837 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001838
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001839 /* high surrogate = top 10 bits added to D800 */
1840 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001841
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001842 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001843 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001844#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 break;
1846
1847 default:
1848 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001849 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 startinpos = s-starts;
1851 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001852 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 }
1854 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001855 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001856
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 outpos = p-PyUnicode_AS_UNICODE(unicode);
1859 if (unicode_decode_call_errorhandler(
1860 errors, &errorHandler,
1861 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001862 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 (PyObject **)&unicode, &outpos, &p))
1864 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 }
Walter Dörwald69652032004-09-07 20:24:22 +00001866 if (consumed)
1867 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868
1869 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001870 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 goto onError;
1872
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 Py_XDECREF(errorHandler);
1874 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 return (PyObject *)unicode;
1876
1877onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 Py_XDECREF(errorHandler);
1879 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 Py_DECREF(unicode);
1881 return NULL;
1882}
1883
Tim Peters602f7402002-04-27 18:03:26 +00001884/* Allocation strategy: if the string is short, convert into a stack buffer
1885 and allocate exactly as much space needed at the end. Else allocate the
1886 maximum possible needed (4 result bytes per Unicode character), and return
1887 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001888*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001889PyObject *
1890PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001891 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001892 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893{
Tim Peters602f7402002-04-27 18:03:26 +00001894#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001895
Martin v. Löwis18e16552006-02-15 17:27:45 +00001896 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001897 PyObject *v; /* result string object */
1898 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001899 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001900 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001901 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001902
Tim Peters602f7402002-04-27 18:03:26 +00001903 assert(s != NULL);
1904 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905
Tim Peters602f7402002-04-27 18:03:26 +00001906 if (size <= MAX_SHORT_UNICHARS) {
1907 /* Write into the stack buffer; nallocated can't overflow.
1908 * At the end, we'll allocate exactly as much heap space as it
1909 * turns out we need.
1910 */
1911 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1912 v = NULL; /* will allocate after we're done */
1913 p = stackbuf;
1914 }
1915 else {
1916 /* Overallocate on the heap, and give the excess back at the end. */
1917 nallocated = size * 4;
1918 if (nallocated / 4 != size) /* overflow! */
1919 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001920 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001921 if (v == NULL)
1922 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001923 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001924 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001925
Tim Peters602f7402002-04-27 18:03:26 +00001926 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001928
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001929 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001930 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001932
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001934 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001935 *p++ = (char)(0xc0 | (ch >> 6));
1936 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001937 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001938 else {
Tim Peters602f7402002-04-27 18:03:26 +00001939 /* Encode UCS2 Unicode ordinals */
1940 if (ch < 0x10000) {
1941 /* Special case: check for high surrogate */
1942 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1943 Py_UCS4 ch2 = s[i];
1944 /* Check for low surrogate and combine the two to
1945 form a UCS4 value */
1946 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001947 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001948 i++;
1949 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001950 }
Tim Peters602f7402002-04-27 18:03:26 +00001951 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001952 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001953 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001954 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1955 *p++ = (char)(0x80 | (ch & 0x3f));
1956 continue;
1957 }
1958encodeUCS4:
1959 /* Encode UCS4 Unicode ordinals */
1960 *p++ = (char)(0xf0 | (ch >> 18));
1961 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1962 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1963 *p++ = (char)(0x80 | (ch & 0x3f));
1964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001966
Tim Peters602f7402002-04-27 18:03:26 +00001967 if (v == NULL) {
1968 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001969 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001970 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001971 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001972 }
1973 else {
1974 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001975 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001976 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001977 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001980
Tim Peters602f7402002-04-27 18:03:26 +00001981#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982}
1983
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1985{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 if (!PyUnicode_Check(unicode)) {
1987 PyErr_BadArgument();
1988 return NULL;
1989 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001990 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1991 PyUnicode_GET_SIZE(unicode),
1992 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993}
1994
Walter Dörwald41980ca2007-08-16 21:55:45 +00001995/* --- UTF-32 Codec ------------------------------------------------------- */
1996
1997PyObject *
1998PyUnicode_DecodeUTF32(const char *s,
1999 Py_ssize_t size,
2000 const char *errors,
2001 int *byteorder)
2002{
2003 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2004}
2005
2006PyObject *
2007PyUnicode_DecodeUTF32Stateful(const char *s,
2008 Py_ssize_t size,
2009 const char *errors,
2010 int *byteorder,
2011 Py_ssize_t *consumed)
2012{
2013 const char *starts = s;
2014 Py_ssize_t startinpos;
2015 Py_ssize_t endinpos;
2016 Py_ssize_t outpos;
2017 PyUnicodeObject *unicode;
2018 Py_UNICODE *p;
2019#ifndef Py_UNICODE_WIDE
2020 int i, pairs;
2021#else
2022 const int pairs = 0;
2023#endif
2024 const unsigned char *q, *e;
2025 int bo = 0; /* assume native ordering by default */
2026 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002027 /* Offsets from q for retrieving bytes in the right order. */
2028#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2029 int iorder[] = {0, 1, 2, 3};
2030#else
2031 int iorder[] = {3, 2, 1, 0};
2032#endif
2033 PyObject *errorHandler = NULL;
2034 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002035 /* On narrow builds we split characters outside the BMP into two
2036 codepoints => count how much extra space we need. */
2037#ifndef Py_UNICODE_WIDE
2038 for (i = pairs = 0; i < size/4; i++)
2039 if (((Py_UCS4 *)s)[i] >= 0x10000)
2040 pairs++;
2041#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002042
2043 /* This might be one to much, because of a BOM */
2044 unicode = _PyUnicode_New((size+3)/4+pairs);
2045 if (!unicode)
2046 return NULL;
2047 if (size == 0)
2048 return (PyObject *)unicode;
2049
2050 /* Unpack UTF-32 encoded data */
2051 p = unicode->str;
2052 q = (unsigned char *)s;
2053 e = q + size;
2054
2055 if (byteorder)
2056 bo = *byteorder;
2057
2058 /* Check for BOM marks (U+FEFF) in the input and adjust current
2059 byte order setting accordingly. In native mode, the leading BOM
2060 mark is skipped, in all other modes, it is copied to the output
2061 stream as-is (giving a ZWNBSP character). */
2062 if (bo == 0) {
2063 if (size >= 4) {
2064 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2065 (q[iorder[1]] << 8) | q[iorder[0]];
2066#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2067 if (bom == 0x0000FEFF) {
2068 q += 4;
2069 bo = -1;
2070 }
2071 else if (bom == 0xFFFE0000) {
2072 q += 4;
2073 bo = 1;
2074 }
2075#else
2076 if (bom == 0x0000FEFF) {
2077 q += 4;
2078 bo = 1;
2079 }
2080 else if (bom == 0xFFFE0000) {
2081 q += 4;
2082 bo = -1;
2083 }
2084#endif
2085 }
2086 }
2087
2088 if (bo == -1) {
2089 /* force LE */
2090 iorder[0] = 0;
2091 iorder[1] = 1;
2092 iorder[2] = 2;
2093 iorder[3] = 3;
2094 }
2095 else if (bo == 1) {
2096 /* force BE */
2097 iorder[0] = 3;
2098 iorder[1] = 2;
2099 iorder[2] = 1;
2100 iorder[3] = 0;
2101 }
2102
2103 while (q < e) {
2104 Py_UCS4 ch;
2105 /* remaining bytes at the end? (size should be divisible by 4) */
2106 if (e-q<4) {
2107 if (consumed)
2108 break;
2109 errmsg = "truncated data";
2110 startinpos = ((const char *)q)-starts;
2111 endinpos = ((const char *)e)-starts;
2112 goto utf32Error;
2113 /* The remaining input chars are ignored if the callback
2114 chooses to skip the input */
2115 }
2116 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2117 (q[iorder[1]] << 8) | q[iorder[0]];
2118
2119 if (ch >= 0x110000)
2120 {
2121 errmsg = "codepoint not in range(0x110000)";
2122 startinpos = ((const char *)q)-starts;
2123 endinpos = startinpos+4;
2124 goto utf32Error;
2125 }
2126#ifndef Py_UNICODE_WIDE
2127 if (ch >= 0x10000)
2128 {
2129 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2130 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2131 }
2132 else
2133#endif
2134 *p++ = ch;
2135 q += 4;
2136 continue;
2137 utf32Error:
2138 outpos = p-PyUnicode_AS_UNICODE(unicode);
2139 if (unicode_decode_call_errorhandler(
2140 errors, &errorHandler,
2141 "utf32", errmsg,
2142 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2143 (PyObject **)&unicode, &outpos, &p))
2144 goto onError;
2145 }
2146
2147 if (byteorder)
2148 *byteorder = bo;
2149
2150 if (consumed)
2151 *consumed = (const char *)q-starts;
2152
2153 /* Adjust length */
2154 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2155 goto onError;
2156
2157 Py_XDECREF(errorHandler);
2158 Py_XDECREF(exc);
2159 return (PyObject *)unicode;
2160
2161onError:
2162 Py_DECREF(unicode);
2163 Py_XDECREF(errorHandler);
2164 Py_XDECREF(exc);
2165 return NULL;
2166}
2167
2168PyObject *
2169PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2170 Py_ssize_t size,
2171 const char *errors,
2172 int byteorder)
2173{
2174 PyObject *v;
2175 unsigned char *p;
2176#ifndef Py_UNICODE_WIDE
2177 int i, pairs;
2178#else
2179 const int pairs = 0;
2180#endif
2181 /* Offsets from p for storing byte pairs in the right order. */
2182#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2183 int iorder[] = {0, 1, 2, 3};
2184#else
2185 int iorder[] = {3, 2, 1, 0};
2186#endif
2187
2188#define STORECHAR(CH) \
2189 do { \
2190 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2191 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2192 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2193 p[iorder[0]] = (CH) & 0xff; \
2194 p += 4; \
2195 } while(0)
2196
2197 /* In narrow builds we can output surrogate pairs as one codepoint,
2198 so we need less space. */
2199#ifndef Py_UNICODE_WIDE
2200 for (i = pairs = 0; i < size-1; i++)
2201 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2202 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2203 pairs++;
2204#endif
2205 v = PyBytes_FromStringAndSize(NULL,
2206 4 * (size - pairs + (byteorder == 0)));
2207 if (v == NULL)
2208 return NULL;
2209
2210 p = (unsigned char *)PyBytes_AS_STRING(v);
2211 if (byteorder == 0)
2212 STORECHAR(0xFEFF);
2213 if (size == 0)
2214 return v;
2215
2216 if (byteorder == -1) {
2217 /* force LE */
2218 iorder[0] = 0;
2219 iorder[1] = 1;
2220 iorder[2] = 2;
2221 iorder[3] = 3;
2222 }
2223 else if (byteorder == 1) {
2224 /* force BE */
2225 iorder[0] = 3;
2226 iorder[1] = 2;
2227 iorder[2] = 1;
2228 iorder[3] = 0;
2229 }
2230
2231 while (size-- > 0) {
2232 Py_UCS4 ch = *s++;
2233#ifndef Py_UNICODE_WIDE
2234 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2235 Py_UCS4 ch2 = *s;
2236 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2237 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2238 s++;
2239 size--;
2240 }
2241 }
2242#endif
2243 STORECHAR(ch);
2244 }
2245 return v;
2246#undef STORECHAR
2247}
2248
2249PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2250{
2251 if (!PyUnicode_Check(unicode)) {
2252 PyErr_BadArgument();
2253 return NULL;
2254 }
2255 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2256 PyUnicode_GET_SIZE(unicode),
2257 NULL,
2258 0);
2259}
2260
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261/* --- UTF-16 Codec ------------------------------------------------------- */
2262
Tim Peters772747b2001-08-09 22:21:55 +00002263PyObject *
2264PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002265 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002266 const char *errors,
2267 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268{
Walter Dörwald69652032004-09-07 20:24:22 +00002269 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2270}
2271
2272PyObject *
2273PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002274 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002275 const char *errors,
2276 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002277 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002278{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002279 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002280 Py_ssize_t startinpos;
2281 Py_ssize_t endinpos;
2282 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 PyUnicodeObject *unicode;
2284 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002285 const unsigned char *q, *e;
2286 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002287 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002288 /* Offsets from q for retrieving byte pairs in the right order. */
2289#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2290 int ihi = 1, ilo = 0;
2291#else
2292 int ihi = 0, ilo = 1;
2293#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 PyObject *errorHandler = NULL;
2295 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296
2297 /* Note: size will always be longer than the resulting Unicode
2298 character count */
2299 unicode = _PyUnicode_New(size);
2300 if (!unicode)
2301 return NULL;
2302 if (size == 0)
2303 return (PyObject *)unicode;
2304
2305 /* Unpack UTF-16 encoded data */
2306 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002307 q = (unsigned char *)s;
2308 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309
2310 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002311 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002313 /* Check for BOM marks (U+FEFF) in the input and adjust current
2314 byte order setting accordingly. In native mode, the leading BOM
2315 mark is skipped, in all other modes, it is copied to the output
2316 stream as-is (giving a ZWNBSP character). */
2317 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002318 if (size >= 2) {
2319 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002320#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002321 if (bom == 0xFEFF) {
2322 q += 2;
2323 bo = -1;
2324 }
2325 else if (bom == 0xFFFE) {
2326 q += 2;
2327 bo = 1;
2328 }
Tim Petersced69f82003-09-16 20:30:58 +00002329#else
Walter Dörwald69652032004-09-07 20:24:22 +00002330 if (bom == 0xFEFF) {
2331 q += 2;
2332 bo = 1;
2333 }
2334 else if (bom == 0xFFFE) {
2335 q += 2;
2336 bo = -1;
2337 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002338#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002339 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341
Tim Peters772747b2001-08-09 22:21:55 +00002342 if (bo == -1) {
2343 /* force LE */
2344 ihi = 1;
2345 ilo = 0;
2346 }
2347 else if (bo == 1) {
2348 /* force BE */
2349 ihi = 0;
2350 ilo = 1;
2351 }
2352
2353 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002355 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002356 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002357 if (consumed)
2358 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002359 errmsg = "truncated data";
2360 startinpos = ((const char *)q)-starts;
2361 endinpos = ((const char *)e)-starts;
2362 goto utf16Error;
2363 /* The remaining input chars are ignored if the callback
2364 chooses to skip the input */
2365 }
2366 ch = (q[ihi] << 8) | q[ilo];
2367
Tim Peters772747b2001-08-09 22:21:55 +00002368 q += 2;
2369
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 if (ch < 0xD800 || ch > 0xDFFF) {
2371 *p++ = ch;
2372 continue;
2373 }
2374
2375 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002376 if (q >= e) {
2377 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002378 startinpos = (((const char *)q)-2)-starts;
2379 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002380 goto utf16Error;
2381 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002382 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002383 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2384 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002385 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002386#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002387 *p++ = ch;
2388 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002389#else
2390 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002391#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002392 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002393 }
2394 else {
2395 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002396 startinpos = (((const char *)q)-4)-starts;
2397 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002398 goto utf16Error;
2399 }
2400
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002402 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 startinpos = (((const char *)q)-2)-starts;
2404 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002405 /* Fall through to report the error */
2406
2407 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002408 outpos = p-PyUnicode_AS_UNICODE(unicode);
2409 if (unicode_decode_call_errorhandler(
2410 errors, &errorHandler,
2411 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002412 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002413 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002414 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 }
2416
2417 if (byteorder)
2418 *byteorder = bo;
2419
Walter Dörwald69652032004-09-07 20:24:22 +00002420 if (consumed)
2421 *consumed = (const char *)q-starts;
2422
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002424 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 goto onError;
2426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002427 Py_XDECREF(errorHandler);
2428 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 return (PyObject *)unicode;
2430
2431onError:
2432 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002433 Py_XDECREF(errorHandler);
2434 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 return NULL;
2436}
2437
Tim Peters772747b2001-08-09 22:21:55 +00002438PyObject *
2439PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002440 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002441 const char *errors,
2442 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002443{
2444 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002445 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002446#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002447 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002448#else
2449 const int pairs = 0;
2450#endif
Tim Peters772747b2001-08-09 22:21:55 +00002451 /* Offsets from p for storing byte pairs in the right order. */
2452#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2453 int ihi = 1, ilo = 0;
2454#else
2455 int ihi = 0, ilo = 1;
2456#endif
2457
2458#define STORECHAR(CH) \
2459 do { \
2460 p[ihi] = ((CH) >> 8) & 0xff; \
2461 p[ilo] = (CH) & 0xff; \
2462 p += 2; \
2463 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002465#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002466 for (i = pairs = 0; i < size; i++)
2467 if (s[i] >= 0x10000)
2468 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002469#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002470 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002471 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 if (v == NULL)
2473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474
Walter Dörwald3cc34522007-05-04 10:48:27 +00002475 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002477 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002478 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002479 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002480
2481 if (byteorder == -1) {
2482 /* force LE */
2483 ihi = 1;
2484 ilo = 0;
2485 }
2486 else if (byteorder == 1) {
2487 /* force BE */
2488 ihi = 0;
2489 ilo = 1;
2490 }
2491
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002492 while (size-- > 0) {
2493 Py_UNICODE ch = *s++;
2494 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002495#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002496 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002497 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2498 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002500#endif
Tim Peters772747b2001-08-09 22:21:55 +00002501 STORECHAR(ch);
2502 if (ch2)
2503 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002506#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507}
2508
2509PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2510{
2511 if (!PyUnicode_Check(unicode)) {
2512 PyErr_BadArgument();
2513 return NULL;
2514 }
2515 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2516 PyUnicode_GET_SIZE(unicode),
2517 NULL,
2518 0);
2519}
2520
2521/* --- Unicode Escape Codec ----------------------------------------------- */
2522
Fredrik Lundh06d12682001-01-24 07:59:11 +00002523static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002524
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002526 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 const char *errors)
2528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002530 Py_ssize_t startinpos;
2531 Py_ssize_t endinpos;
2532 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002537 char* message;
2538 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 PyObject *errorHandler = NULL;
2540 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002541
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542 /* Escaped strings will always be longer than the resulting
2543 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 length after conversion to the true value.
2545 (but if the error callback returns a long replacement string
2546 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 v = _PyUnicode_New(size);
2548 if (v == NULL)
2549 goto onError;
2550 if (size == 0)
2551 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002555
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 while (s < end) {
2557 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002558 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560
2561 /* Non-escape characters are interpreted as Unicode ordinals */
2562 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002563 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 continue;
2565 }
2566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002567 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 /* \ - Escapes */
2569 s++;
2570 switch (*s++) {
2571
2572 /* \x escapes */
2573 case '\n': break;
2574 case '\\': *p++ = '\\'; break;
2575 case '\'': *p++ = '\''; break;
2576 case '\"': *p++ = '\"'; break;
2577 case 'b': *p++ = '\b'; break;
2578 case 'f': *p++ = '\014'; break; /* FF */
2579 case 't': *p++ = '\t'; break;
2580 case 'n': *p++ = '\n'; break;
2581 case 'r': *p++ = '\r'; break;
2582 case 'v': *p++ = '\013'; break; /* VT */
2583 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2584
2585 /* \OOO (octal) escapes */
2586 case '0': case '1': case '2': case '3':
2587 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002588 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002590 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002592 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002594 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 break;
2596
Fredrik Lundhccc74732001-02-18 22:13:49 +00002597 /* hex escapes */
2598 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002600 digits = 2;
2601 message = "truncated \\xXX escape";
2602 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603
Fredrik Lundhccc74732001-02-18 22:13:49 +00002604 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002606 digits = 4;
2607 message = "truncated \\uXXXX escape";
2608 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609
Fredrik Lundhccc74732001-02-18 22:13:49 +00002610 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002611 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002612 digits = 8;
2613 message = "truncated \\UXXXXXXXX escape";
2614 hexescape:
2615 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 outpos = p-PyUnicode_AS_UNICODE(v);
2617 if (s+digits>end) {
2618 endinpos = size;
2619 if (unicode_decode_call_errorhandler(
2620 errors, &errorHandler,
2621 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002622 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 (PyObject **)&v, &outpos, &p))
2624 goto onError;
2625 goto nextByte;
2626 }
2627 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002628 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002629 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 endinpos = (s+i+1)-starts;
2631 if (unicode_decode_call_errorhandler(
2632 errors, &errorHandler,
2633 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002634 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002637 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002638 }
2639 chr = (chr<<4) & ~0xF;
2640 if (c >= '0' && c <= '9')
2641 chr += c - '0';
2642 else if (c >= 'a' && c <= 'f')
2643 chr += 10 + c - 'a';
2644 else
2645 chr += 10 + c - 'A';
2646 }
2647 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002648 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 /* _decoding_error will have already written into the
2650 target buffer. */
2651 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002652 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002653 /* when we get here, chr is a 32-bit unicode character */
2654 if (chr <= 0xffff)
2655 /* UCS-2 character */
2656 *p++ = (Py_UNICODE) chr;
2657 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002658 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002659 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002660#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002661 *p++ = chr;
2662#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002663 chr -= 0x10000L;
2664 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002665 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002666#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002667 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 endinpos = s-starts;
2669 outpos = p-PyUnicode_AS_UNICODE(v);
2670 if (unicode_decode_call_errorhandler(
2671 errors, &errorHandler,
2672 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002673 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002675 goto onError;
2676 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002677 break;
2678
2679 /* \N{name} */
2680 case 'N':
2681 message = "malformed \\N character escape";
2682 if (ucnhash_CAPI == NULL) {
2683 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002684 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002685 m = PyImport_ImportModule("unicodedata");
2686 if (m == NULL)
2687 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002688 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002689 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002690 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002691 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002692 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002693 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002694 if (ucnhash_CAPI == NULL)
2695 goto ucnhashError;
2696 }
2697 if (*s == '{') {
2698 const char *start = s+1;
2699 /* look for the closing brace */
2700 while (*s != '}' && s < end)
2701 s++;
2702 if (s > start && s < end && *s == '}') {
2703 /* found a name. look it up in the unicode database */
2704 message = "unknown Unicode character name";
2705 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002706 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002707 goto store;
2708 }
2709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 endinpos = s-starts;
2711 outpos = p-PyUnicode_AS_UNICODE(v);
2712 if (unicode_decode_call_errorhandler(
2713 errors, &errorHandler,
2714 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002715 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002717 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002718 break;
2719
2720 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002721 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 message = "\\ at end of string";
2723 s--;
2724 endinpos = s-starts;
2725 outpos = p-PyUnicode_AS_UNICODE(v);
2726 if (unicode_decode_call_errorhandler(
2727 errors, &errorHandler,
2728 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002729 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002730 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002731 goto onError;
2732 }
2733 else {
2734 *p++ = '\\';
2735 *p++ = (unsigned char)s[-1];
2736 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002737 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 nextByte:
2740 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002742 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002744 Py_XDECREF(errorHandler);
2745 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002747
Fredrik Lundhccc74732001-02-18 22:13:49 +00002748ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002749 PyErr_SetString(
2750 PyExc_UnicodeError,
2751 "\\N escapes not supported (can't load unicodedata module)"
2752 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002753 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 Py_XDECREF(errorHandler);
2755 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002756 return NULL;
2757
Fredrik Lundhccc74732001-02-18 22:13:49 +00002758onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 Py_XDECREF(errorHandler);
2761 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 return NULL;
2763}
2764
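/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment describes PyUnicode_EncodeUnicodeEscape() further
   below, not the findchar() helper that immediately follows it.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.  (PyUnicode_EncodeUnicodeEscape() as defined below takes
   no "quotes" argument and adds no quotes itself.)

*/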
2771
Thomas Wouters477c8d52006-05-27 19:21:47 +00002772Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2773 Py_ssize_t size,
2774 Py_UNICODE ch)
2775{
2776 /* like wcschr, but doesn't stop at NULL characters */
2777
2778 while (size-- > 0) {
2779 if (*s == ch)
2780 return s;
2781 s++;
2782 }
2783
2784 return NULL;
2785}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002786
Walter Dörwald79e913e2007-05-12 11:08:06 +00002787static const char *hexdigits = "0123456789abcdef";
2788
2789PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2790 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791{
2792 PyObject *repr;
2793 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794
Thomas Wouters89f507f2006-12-13 04:49:30 +00002795 /* XXX(nnorwitz): rather than over-allocating, it would be
2796 better to choose a different scheme. Perhaps scan the
2797 first N-chars of the string and allocate based on that size.
2798 */
2799 /* Initial allocation is based on the longest-possible unichr
2800 escape.
2801
2802 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2803 unichr, so in this case it's the longest unichr escape. In
2804 narrow (UTF-16) builds this is five chars per source unichr
2805 since there are two unichrs in the surrogate pair, so in narrow
2806 (UTF-16) builds it's not the longest unichr escape.
2807
2808 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2809 so in the narrow (UTF-16) build case it's the longest unichr
2810 escape.
2811 */
2812
Walter Dörwald79e913e2007-05-12 11:08:06 +00002813 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002814#ifdef Py_UNICODE_WIDE
2815 + 10*size
2816#else
2817 + 6*size
2818#endif
2819 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 if (repr == NULL)
2821 return NULL;
2822
Walter Dörwald79e913e2007-05-12 11:08:06 +00002823 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 while (size-- > 0) {
2826 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002827
Walter Dörwald79e913e2007-05-12 11:08:06 +00002828 /* Escape backslashes */
2829 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 *p++ = '\\';
2831 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002832 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002833 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002834
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002835#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002836 /* Map 21-bit characters to '\U00xxxxxx' */
2837 else if (ch >= 0x10000) {
2838 *p++ = '\\';
2839 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002840 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2841 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2842 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2843 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2844 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2845 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2846 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2847 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002848 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002849 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002850#else
2851 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002852 else if (ch >= 0xD800 && ch < 0xDC00) {
2853 Py_UNICODE ch2;
2854 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002855
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002856 ch2 = *s++;
2857 size--;
2858 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2859 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2860 *p++ = '\\';
2861 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002862 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2863 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2864 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2865 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2866 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2867 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2868 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2869 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002870 continue;
2871 }
2872 /* Fall through: isolated surrogates are copied as-is */
2873 s--;
2874 size++;
2875 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002876#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002877
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002879 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 *p++ = '\\';
2881 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002882 *p++ = hexdigits[(ch >> 12) & 0x000F];
2883 *p++ = hexdigits[(ch >> 8) & 0x000F];
2884 *p++ = hexdigits[(ch >> 4) & 0x000F];
2885 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002887
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002888 /* Map special whitespace to '\t', \n', '\r' */
2889 else if (ch == '\t') {
2890 *p++ = '\\';
2891 *p++ = 't';
2892 }
2893 else if (ch == '\n') {
2894 *p++ = '\\';
2895 *p++ = 'n';
2896 }
2897 else if (ch == '\r') {
2898 *p++ = '\\';
2899 *p++ = 'r';
2900 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002901
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002902 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002903 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002905 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002906 *p++ = hexdigits[(ch >> 4) & 0x000F];
2907 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002908 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002909
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 /* Copy everything else as-is */
2911 else
2912 *p++ = (char) ch;
2913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914
2915 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002916 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2917 Py_DECREF(repr);
2918 return NULL;
2919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 return repr;
2921}
2922
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2924{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002925 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 if (!PyUnicode_Check(unicode)) {
2927 PyErr_BadArgument();
2928 return NULL;
2929 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002930 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2931 PyUnicode_GET_SIZE(unicode));
2932
2933 if (!s)
2934 return NULL;
2935 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2936 PyBytes_GET_SIZE(s));
2937 Py_DECREF(s);
2938 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939}
2940
2941/* --- Raw Unicode Escape Codec ------------------------------------------- */
2942
2943PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002944 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 const char *errors)
2946{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002948 Py_ssize_t startinpos;
2949 Py_ssize_t endinpos;
2950 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 const char *end;
2954 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002955 PyObject *errorHandler = NULL;
2956 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002957
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 /* Escaped strings will always be longer than the resulting
2959 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 length after conversion to the true value. (But decoding error
2961 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962 v = _PyUnicode_New(size);
2963 if (v == NULL)
2964 goto onError;
2965 if (size == 0)
2966 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002967 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968 end = s + size;
2969 while (s < end) {
2970 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002971 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002973 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974
2975 /* Non-escape characters are interpreted as Unicode ordinals */
2976 if (*s != '\\') {
2977 *p++ = (unsigned char)*s++;
2978 continue;
2979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981
2982 /* \u-escapes are only interpreted iff the number of leading
2983 backslashes if odd */
2984 bs = s;
2985 for (;s < end;) {
2986 if (*s != '\\')
2987 break;
2988 *p++ = (unsigned char)*s++;
2989 }
2990 if (((s - bs) & 1) == 0 ||
2991 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002992 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 continue;
2994 }
2995 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002996 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 s++;
2998
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002999 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003001 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004 endinpos = s-starts;
3005 if (unicode_decode_call_errorhandler(
3006 errors, &errorHandler,
3007 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003008 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003009 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003011 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012 }
3013 x = (x<<4) & ~0xF;
3014 if (c >= '0' && c <= '9')
3015 x += c - '0';
3016 else if (c >= 'a' && c <= 'f')
3017 x += 10 + c - 'a';
3018 else
3019 x += 10 + c - 'A';
3020 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003021#ifndef Py_UNICODE_WIDE
3022 if (x > 0x10000) {
3023 if (unicode_decode_call_errorhandler(
3024 errors, &errorHandler,
3025 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003026 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003027 (PyObject **)&v, &outpos, &p))
3028 goto onError;
3029 }
3030#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 *p++ = x;
3032 nextByte:
3033 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003035 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003036 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 Py_XDECREF(errorHandler);
3038 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003040
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 onError:
3042 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 Py_XDECREF(errorHandler);
3044 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 return NULL;
3046}
3047
3048PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003049 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050{
3051 PyObject *repr;
3052 char *p;
3053 char *q;
3054
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003055#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00003056 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003057#else
Walter Dörwald711005d2007-05-12 12:03:26 +00003058 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003059#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 if (repr == NULL)
3061 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003062 if (size == 0)
3063 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064
Walter Dörwald711005d2007-05-12 12:03:26 +00003065 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 while (size-- > 0) {
3067 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003068#ifdef Py_UNICODE_WIDE
3069 /* Map 32-bit characters to '\Uxxxxxxxx' */
3070 if (ch >= 0x10000) {
3071 *p++ = '\\';
3072 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003073 *p++ = hexdigits[(ch >> 28) & 0xf];
3074 *p++ = hexdigits[(ch >> 24) & 0xf];
3075 *p++ = hexdigits[(ch >> 20) & 0xf];
3076 *p++ = hexdigits[(ch >> 16) & 0xf];
3077 *p++ = hexdigits[(ch >> 12) & 0xf];
3078 *p++ = hexdigits[(ch >> 8) & 0xf];
3079 *p++ = hexdigits[(ch >> 4) & 0xf];
3080 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003081 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003082 else
3083#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 /* Map 16-bit characters to '\uxxxx' */
3085 if (ch >= 256) {
3086 *p++ = '\\';
3087 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003088 *p++ = hexdigits[(ch >> 12) & 0xf];
3089 *p++ = hexdigits[(ch >> 8) & 0xf];
3090 *p++ = hexdigits[(ch >> 4) & 0xf];
3091 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 }
3093 /* Copy everything else as-is */
3094 else
3095 *p++ = (char) ch;
3096 }
3097 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00003098 if (PyBytes_Resize(repr, p - q)) {
3099 Py_DECREF(repr);
3100 return NULL;
3101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 return repr;
3103}
3104
3105PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3106{
Walter Dörwald711005d2007-05-12 12:03:26 +00003107 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003109 PyErr_BadArgument();
3110 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003112 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3113 PyUnicode_GET_SIZE(unicode));
3114
3115 if (!s)
3116 return NULL;
3117 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3118 PyBytes_GET_SIZE(s));
3119 Py_DECREF(s);
3120 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121}
3122
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003123/* --- Unicode Internal Codec ------------------------------------------- */
3124
3125PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003127 const char *errors)
3128{
3129 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003130 Py_ssize_t startinpos;
3131 Py_ssize_t endinpos;
3132 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003133 PyUnicodeObject *v;
3134 Py_UNICODE *p;
3135 const char *end;
3136 const char *reason;
3137 PyObject *errorHandler = NULL;
3138 PyObject *exc = NULL;
3139
Neal Norwitzd43069c2006-01-08 01:12:10 +00003140#ifdef Py_UNICODE_WIDE
3141 Py_UNICODE unimax = PyUnicode_GetMax();
3142#endif
3143
Thomas Wouters89f507f2006-12-13 04:49:30 +00003144 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003145 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3146 if (v == NULL)
3147 goto onError;
3148 if (PyUnicode_GetSize((PyObject *)v) == 0)
3149 return (PyObject *)v;
3150 p = PyUnicode_AS_UNICODE(v);
3151 end = s + size;
3152
3153 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003154 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003155 /* We have to sanity check the raw data, otherwise doom looms for
3156 some malformed UCS-4 data. */
3157 if (
3158 #ifdef Py_UNICODE_WIDE
3159 *p > unimax || *p < 0 ||
3160 #endif
3161 end-s < Py_UNICODE_SIZE
3162 )
3163 {
3164 startinpos = s - starts;
3165 if (end-s < Py_UNICODE_SIZE) {
3166 endinpos = end-starts;
3167 reason = "truncated input";
3168 }
3169 else {
3170 endinpos = s - starts + Py_UNICODE_SIZE;
3171 reason = "illegal code point (> 0x10FFFF)";
3172 }
3173 outpos = p - PyUnicode_AS_UNICODE(v);
3174 if (unicode_decode_call_errorhandler(
3175 errors, &errorHandler,
3176 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003177 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003178 (PyObject **)&v, &outpos, &p)) {
3179 goto onError;
3180 }
3181 }
3182 else {
3183 p++;
3184 s += Py_UNICODE_SIZE;
3185 }
3186 }
3187
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003188 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003189 goto onError;
3190 Py_XDECREF(errorHandler);
3191 Py_XDECREF(exc);
3192 return (PyObject *)v;
3193
3194 onError:
3195 Py_XDECREF(v);
3196 Py_XDECREF(errorHandler);
3197 Py_XDECREF(exc);
3198 return NULL;
3199}
3200
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201/* --- Latin-1 Codec ------------------------------------------------------ */
3202
3203PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003204 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 const char *errors)
3206{
3207 PyUnicodeObject *v;
3208 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003209
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003211 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003212 Py_UNICODE r = *(unsigned char*)s;
3213 return PyUnicode_FromUnicode(&r, 1);
3214 }
3215
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 v = _PyUnicode_New(size);
3217 if (v == NULL)
3218 goto onError;
3219 if (size == 0)
3220 return (PyObject *)v;
3221 p = PyUnicode_AS_UNICODE(v);
3222 while (size-- > 0)
3223 *p++ = (unsigned char)*s++;
3224 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003225
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 onError:
3227 Py_XDECREF(v);
3228 return NULL;
3229}
3230
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231/* create or adjust a UnicodeEncodeError */
3232static void make_encode_exception(PyObject **exceptionObject,
3233 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003234 const Py_UNICODE *unicode, Py_ssize_t size,
3235 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 if (*exceptionObject == NULL) {
3239 *exceptionObject = PyUnicodeEncodeError_Create(
3240 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 }
3242 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3244 goto onError;
3245 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3246 goto onError;
3247 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3248 goto onError;
3249 return;
3250 onError:
3251 Py_DECREF(*exceptionObject);
3252 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 }
3254}
3255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256/* raises a UnicodeEncodeError */
3257static void raise_encode_exception(PyObject **exceptionObject,
3258 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003259 const Py_UNICODE *unicode, Py_ssize_t size,
3260 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 const char *reason)
3262{
3263 make_encode_exception(exceptionObject,
3264 encoding, unicode, size, startpos, endpos, reason);
3265 if (*exceptionObject != NULL)
3266 PyCodec_StrictErrors(*exceptionObject);
3267}
3268
3269/* error handling callback helper:
3270 build arguments, call the callback and check the arguments,
3271 put the result into newpos and return the replacement string, which
3272 has to be freed by the caller */
3273static PyObject *unicode_encode_call_errorhandler(const char *errors,
3274 PyObject **errorHandler,
3275 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003276 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3277 Py_ssize_t startpos, Py_ssize_t endpos,
3278 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003279{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003280 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281
3282 PyObject *restuple;
3283 PyObject *resunicode;
3284
3285 if (*errorHandler == NULL) {
3286 *errorHandler = PyCodec_LookupError(errors);
3287 if (*errorHandler == NULL)
3288 return NULL;
3289 }
3290
3291 make_encode_exception(exceptionObject,
3292 encoding, unicode, size, startpos, endpos, reason);
3293 if (*exceptionObject == NULL)
3294 return NULL;
3295
3296 restuple = PyObject_CallFunctionObjArgs(
3297 *errorHandler, *exceptionObject, NULL);
3298 if (restuple == NULL)
3299 return NULL;
3300 if (!PyTuple_Check(restuple)) {
3301 PyErr_Format(PyExc_TypeError, &argparse[4]);
3302 Py_DECREF(restuple);
3303 return NULL;
3304 }
3305 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3306 &resunicode, newpos)) {
3307 Py_DECREF(restuple);
3308 return NULL;
3309 }
3310 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003311 *newpos = size+*newpos;
3312 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003313 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003314 Py_DECREF(restuple);
3315 return NULL;
3316 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 Py_INCREF(resunicode);
3318 Py_DECREF(restuple);
3319 return resunicode;
3320}
3321
3322static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003323 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 const char *errors,
3325 int limit)
3326{
3327 /* output object */
3328 PyObject *res;
3329 /* pointers to the beginning and end+1 of input */
3330 const Py_UNICODE *startp = p;
3331 const Py_UNICODE *endp = p + size;
3332 /* pointer to the beginning of the unencodable characters */
3333 /* const Py_UNICODE *badp = NULL; */
3334 /* pointer into the output */
3335 char *str;
3336 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003337 Py_ssize_t respos = 0;
3338 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003339 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3340 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 PyObject *errorHandler = NULL;
3342 PyObject *exc = NULL;
3343 /* the following variable is used for caching string comparisons
3344 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3345 int known_errorHandler = -1;
3346
3347 /* allocate enough for a simple encoding without
3348 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003349 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 if (res == NULL)
3351 goto onError;
3352 if (size == 0)
3353 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003354 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 ressize = size;
3356
3357 while (p<endp) {
3358 Py_UNICODE c = *p;
3359
3360 /* can we encode this? */
3361 if (c<limit) {
3362 /* no overflow check, because we know that the space is enough */
3363 *str++ = (char)c;
3364 ++p;
3365 }
3366 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003367 Py_ssize_t unicodepos = p-startp;
3368 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003370 Py_ssize_t repsize;
3371 Py_ssize_t newpos;
3372 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003373 Py_UNICODE *uni2;
3374 /* startpos for collecting unencodable chars */
3375 const Py_UNICODE *collstart = p;
3376 const Py_UNICODE *collend = p;
3377 /* find all unecodable characters */
3378 while ((collend < endp) && ((*collend)>=limit))
3379 ++collend;
3380 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3381 if (known_errorHandler==-1) {
3382 if ((errors==NULL) || (!strcmp(errors, "strict")))
3383 known_errorHandler = 1;
3384 else if (!strcmp(errors, "replace"))
3385 known_errorHandler = 2;
3386 else if (!strcmp(errors, "ignore"))
3387 known_errorHandler = 3;
3388 else if (!strcmp(errors, "xmlcharrefreplace"))
3389 known_errorHandler = 4;
3390 else
3391 known_errorHandler = 0;
3392 }
3393 switch (known_errorHandler) {
3394 case 1: /* strict */
3395 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3396 goto onError;
3397 case 2: /* replace */
3398 while (collstart++<collend)
3399 *str++ = '?'; /* fall through */
3400 case 3: /* ignore */
3401 p = collend;
3402 break;
3403 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003404 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405 /* determine replacement size (temporarily (mis)uses p) */
3406 for (p = collstart, repsize = 0; p < collend; ++p) {
3407 if (*p<10)
3408 repsize += 2+1+1;
3409 else if (*p<100)
3410 repsize += 2+2+1;
3411 else if (*p<1000)
3412 repsize += 2+3+1;
3413 else if (*p<10000)
3414 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003415#ifndef Py_UNICODE_WIDE
3416 else
3417 repsize += 2+5+1;
3418#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 else if (*p<100000)
3420 repsize += 2+5+1;
3421 else if (*p<1000000)
3422 repsize += 2+6+1;
3423 else
3424 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003425#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 }
3427 requiredsize = respos+repsize+(endp-collend);
3428 if (requiredsize > ressize) {
3429 if (requiredsize<2*ressize)
3430 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003431 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003432 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003433 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 ressize = requiredsize;
3435 }
3436 /* generate replacement (temporarily (mis)uses p) */
3437 for (p = collstart; p < collend; ++p) {
3438 str += sprintf(str, "&#%d;", (int)*p);
3439 }
3440 p = collend;
3441 break;
3442 default:
3443 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3444 encoding, reason, startp, size, &exc,
3445 collstart-startp, collend-startp, &newpos);
3446 if (repunicode == NULL)
3447 goto onError;
3448 /* need more space? (at least enough for what we
3449 have+the replacement+the rest of the string, so
3450 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003451 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 repsize = PyUnicode_GET_SIZE(repunicode);
3453 requiredsize = respos+repsize+(endp-collend);
3454 if (requiredsize > ressize) {
3455 if (requiredsize<2*ressize)
3456 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003457 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458 Py_DECREF(repunicode);
3459 goto onError;
3460 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003461 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 ressize = requiredsize;
3463 }
3464 /* check if there is anything unencodable in the replacement
3465 and copy it to the output */
3466 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3467 c = *uni2;
3468 if (c >= limit) {
3469 raise_encode_exception(&exc, encoding, startp, size,
3470 unicodepos, unicodepos+1, reason);
3471 Py_DECREF(repunicode);
3472 goto onError;
3473 }
3474 *str = (char)c;
3475 }
3476 p = startp + newpos;
3477 Py_DECREF(repunicode);
3478 }
3479 }
3480 }
3481 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003482 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 if (respos<ressize)
3484 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003485 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 Py_XDECREF(errorHandler);
3487 Py_XDECREF(exc);
3488 return res;
3489
3490 onError:
3491 Py_XDECREF(res);
3492 Py_XDECREF(errorHandler);
3493 Py_XDECREF(exc);
3494 return NULL;
3495}
3496
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003498 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 const char *errors)
3500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502}
3503
3504PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3505{
3506 if (!PyUnicode_Check(unicode)) {
3507 PyErr_BadArgument();
3508 return NULL;
3509 }
3510 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3511 PyUnicode_GET_SIZE(unicode),
3512 NULL);
3513}
3514
3515/* --- 7-bit ASCII Codec -------------------------------------------------- */
3516
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003518 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 const char *errors)
3520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 PyUnicodeObject *v;
3523 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 Py_ssize_t startinpos;
3525 Py_ssize_t endinpos;
3526 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 const char *e;
3528 PyObject *errorHandler = NULL;
3529 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003530
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003532 if (size == 1 && *(unsigned char*)s < 128) {
3533 Py_UNICODE r = *(unsigned char*)s;
3534 return PyUnicode_FromUnicode(&r, 1);
3535 }
Tim Petersced69f82003-09-16 20:30:58 +00003536
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 v = _PyUnicode_New(size);
3538 if (v == NULL)
3539 goto onError;
3540 if (size == 0)
3541 return (PyObject *)v;
3542 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 e = s + size;
3544 while (s < e) {
3545 register unsigned char c = (unsigned char)*s;
3546 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 ++s;
3549 }
3550 else {
3551 startinpos = s-starts;
3552 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003553 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 if (unicode_decode_call_errorhandler(
3555 errors, &errorHandler,
3556 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003557 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003562 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003563 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003564 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_XDECREF(errorHandler);
3566 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003568
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 onError:
3570 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 Py_XDECREF(errorHandler);
3572 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 return NULL;
3574}
3575
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003577 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 const char *errors)
3579{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581}
3582
3583PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3584{
3585 if (!PyUnicode_Check(unicode)) {
3586 PyErr_BadArgument();
3587 return NULL;
3588 }
3589 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3590 PyUnicode_GET_SIZE(unicode),
3591 NULL);
3592}
3593
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003594#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003595
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003596/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003597
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003598#if SIZEOF_INT < SIZEOF_SSIZE_T
3599#define NEED_RETRY
3600#endif
3601
3602/* XXX This code is limited to "true" double-byte encodings, as
3603 a) it assumes an incomplete character consists of a single byte, and
3604 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3605 encodings, see IsDBCSLeadByteEx documentation. */
3606
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts counts only if the
   character located by CharPrev() is the byte itself, is not a lead byte,
   or starts exactly two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3617
3618/*
3619 * Decode MBCS string into unicode object. If 'final' is set, converts
3620 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3621 */
3622static int decode_mbcs(PyUnicodeObject **v,
3623 const char *s, /* MBCS string */
3624 int size, /* sizeof MBCS string */
3625 int final)
3626{
3627 Py_UNICODE *p;
3628 Py_ssize_t n = 0;
3629 int usize = 0;
3630
3631 assert(size >= 0);
3632
3633 /* Skip trailing lead-byte unless 'final' is set */
3634 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3635 --size;
3636
3637 /* First get the size of the result */
3638 if (size > 0) {
3639 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3640 if (usize == 0) {
3641 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3642 return -1;
3643 }
3644 }
3645
3646 if (*v == NULL) {
3647 /* Create unicode object */
3648 *v = _PyUnicode_New(usize);
3649 if (*v == NULL)
3650 return -1;
3651 }
3652 else {
3653 /* Extend unicode object */
3654 n = PyUnicode_GET_SIZE(*v);
3655 if (_PyUnicode_Resize(v, n + usize) < 0)
3656 return -1;
3657 }
3658
3659 /* Do the conversion */
3660 if (size > 0) {
3661 p = PyUnicode_AS_UNICODE(*v) + n;
3662 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3663 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3664 return -1;
3665 }
3666 }
3667
3668 return size;
3669}
3670
3671PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3672 Py_ssize_t size,
3673 const char *errors,
3674 Py_ssize_t *consumed)
3675{
3676 PyUnicodeObject *v = NULL;
3677 int done;
3678
3679 if (consumed)
3680 *consumed = 0;
3681
3682#ifdef NEED_RETRY
3683 retry:
3684 if (size > INT_MAX)
3685 done = decode_mbcs(&v, s, INT_MAX, 0);
3686 else
3687#endif
3688 done = decode_mbcs(&v, s, (int)size, !consumed);
3689
3690 if (done < 0) {
3691 Py_XDECREF(v);
3692 return NULL;
3693 }
3694
3695 if (consumed)
3696 *consumed += done;
3697
3698#ifdef NEED_RETRY
3699 if (size > INT_MAX) {
3700 s += done;
3701 size -= done;
3702 goto retry;
3703 }
3704#endif
3705
3706 return (PyObject *)v;
3707}
3708
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003709PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003710 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003711 const char *errors)
3712{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003713 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3714}
3715
3716/*
3717 * Convert unicode into string object (MBCS).
3718 * Returns 0 if succeed, -1 otherwise.
3719 */
3720static int encode_mbcs(PyObject **repr,
3721 const Py_UNICODE *p, /* unicode */
3722 int size) /* size of unicode */
3723{
3724 int mbcssize = 0;
3725 Py_ssize_t n = 0;
3726
3727 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003728
3729 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003730 if (size > 0) {
3731 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3732 if (mbcssize == 0) {
3733 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3734 return -1;
3735 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003736 }
3737
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003738 if (*repr == NULL) {
3739 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003740 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003741 if (*repr == NULL)
3742 return -1;
3743 }
3744 else {
3745 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003746 n = PyBytes_Size(*repr);
3747 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003748 return -1;
3749 }
3750
3751 /* Do the conversion */
3752 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003753 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003754 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3755 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3756 return -1;
3757 }
3758 }
3759
3760 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003761}
3762
3763PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003764 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003765 const char *errors)
3766{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003767 PyObject *repr = NULL;
3768 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003769
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003770#ifdef NEED_RETRY
3771 retry:
3772 if (size > INT_MAX)
3773 ret = encode_mbcs(&repr, p, INT_MAX);
3774 else
3775#endif
3776 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003777
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003778 if (ret < 0) {
3779 Py_XDECREF(repr);
3780 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003781 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003782
3783#ifdef NEED_RETRY
3784 if (size > INT_MAX) {
3785 p += INT_MAX;
3786 size -= INT_MAX;
3787 goto retry;
3788 }
3789#endif
3790
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003791 return repr;
3792}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003793
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003794PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3795{
3796 if (!PyUnicode_Check(unicode)) {
3797 PyErr_BadArgument();
3798 return NULL;
3799 }
3800 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3801 PyUnicode_GET_SIZE(unicode),
3802 NULL);
3803}
3804
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003805#undef NEED_RETRY
3806
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003807#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003808
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809/* --- Character Mapping Codec -------------------------------------------- */
3810
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003812 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 PyObject *mapping,
3814 const char *errors)
3815{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003817 Py_ssize_t startinpos;
3818 Py_ssize_t endinpos;
3819 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 PyUnicodeObject *v;
3822 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003823 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 PyObject *errorHandler = NULL;
3825 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003826 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003827 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003828
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 /* Default to Latin-1 */
3830 if (mapping == NULL)
3831 return PyUnicode_DecodeLatin1(s, size, errors);
3832
3833 v = _PyUnicode_New(size);
3834 if (v == NULL)
3835 goto onError;
3836 if (size == 0)
3837 return (PyObject *)v;
3838 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003840 if (PyUnicode_CheckExact(mapping)) {
3841 mapstring = PyUnicode_AS_UNICODE(mapping);
3842 maplen = PyUnicode_GET_SIZE(mapping);
3843 while (s < e) {
3844 unsigned char ch = *s;
3845 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003847 if (ch < maplen)
3848 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003850 if (x == 0xfffe) {
3851 /* undefined mapping */
3852 outpos = p-PyUnicode_AS_UNICODE(v);
3853 startinpos = s-starts;
3854 endinpos = startinpos+1;
3855 if (unicode_decode_call_errorhandler(
3856 errors, &errorHandler,
3857 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003858 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003859 (PyObject **)&v, &outpos, &p)) {
3860 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003861 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003862 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003863 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003864 *p++ = x;
3865 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003867 }
3868 else {
3869 while (s < e) {
3870 unsigned char ch = *s;
3871 PyObject *w, *x;
3872
3873 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3874 w = PyInt_FromLong((long)ch);
3875 if (w == NULL)
3876 goto onError;
3877 x = PyObject_GetItem(mapping, w);
3878 Py_DECREF(w);
3879 if (x == NULL) {
3880 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3881 /* No mapping found means: mapping is undefined. */
3882 PyErr_Clear();
3883 x = Py_None;
3884 Py_INCREF(x);
3885 } else
3886 goto onError;
3887 }
3888
3889 /* Apply mapping */
3890 if (PyInt_Check(x)) {
3891 long value = PyInt_AS_LONG(x);
3892 if (value < 0 || value > 65535) {
3893 PyErr_SetString(PyExc_TypeError,
3894 "character mapping must be in range(65536)");
3895 Py_DECREF(x);
3896 goto onError;
3897 }
3898 *p++ = (Py_UNICODE)value;
3899 }
3900 else if (x == Py_None) {
3901 /* undefined mapping */
3902 outpos = p-PyUnicode_AS_UNICODE(v);
3903 startinpos = s-starts;
3904 endinpos = startinpos+1;
3905 if (unicode_decode_call_errorhandler(
3906 errors, &errorHandler,
3907 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003908 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003909 (PyObject **)&v, &outpos, &p)) {
3910 Py_DECREF(x);
3911 goto onError;
3912 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003913 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003914 continue;
3915 }
3916 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003917 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003918
3919 if (targetsize == 1)
3920 /* 1-1 mapping */
3921 *p++ = *PyUnicode_AS_UNICODE(x);
3922
3923 else if (targetsize > 1) {
3924 /* 1-n mapping */
3925 if (targetsize > extrachars) {
3926 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003927 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3928 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003929 (targetsize << 2);
3930 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003931 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003932 if (_PyUnicode_Resize(&v,
3933 PyUnicode_GET_SIZE(v) + needed) < 0) {
3934 Py_DECREF(x);
3935 goto onError;
3936 }
3937 p = PyUnicode_AS_UNICODE(v) + oldpos;
3938 }
3939 Py_UNICODE_COPY(p,
3940 PyUnicode_AS_UNICODE(x),
3941 targetsize);
3942 p += targetsize;
3943 extrachars -= targetsize;
3944 }
3945 /* 1-0 mapping: skip the character */
3946 }
3947 else {
3948 /* wrong return value */
3949 PyErr_SetString(PyExc_TypeError,
3950 "character mapping must return integer, None or unicode");
3951 Py_DECREF(x);
3952 goto onError;
3953 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003955 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 }
3958 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003959 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 Py_XDECREF(errorHandler);
3962 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003964
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 Py_XDECREF(errorHandler);
3967 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 Py_XDECREF(v);
3969 return NULL;
3970}
3971
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003972/* Charmap encoding: the lookup table */
3973
/* Three-level lookup trie used for charmap encoding.  Objects of this
   struct are allocated by PyUnicode_BuildEncodingMap() with extra trailing
   bytes, so that level23 really holds 16*count2 level-2 entries followed
   by 128*count3 level-3 entries. */
struct encoding_map{
    PyObject_HEAD
    /* indexed by (c >> 11); 0xFF means "no level-2 block for this range" */
    unsigned char level1[32];
    /* number of 16-entry level-2 blocks and 128-entry level-3 blocks */
    int count2, count3;
    /* variable-length tail (declared with one element; the size
       computations elsewhere subtract that byte again) */
    unsigned char level23[1];
};
3980
3981static PyObject*
3982encoding_map_size(PyObject *obj, PyObject* args)
3983{
3984 struct encoding_map *map = (struct encoding_map*)obj;
3985 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3986 128*map->count3);
3987}
3988
/* Method table of the EncodingMap type: only "size" is provided.
   The table is terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
3994
/* tp_dealloc of the EncodingMap type.  The object owns no references to
   other objects, so releasing its memory block is all that is needed
   (it is allocated with PyObject_MALLOC in PyUnicode_BuildEncodingMap). */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4000
/* Type object for the trie-based encoding map.  Only tp_name,
   tp_basicsize, tp_dealloc, tp_flags and tp_methods are filled in; every
   other slot (including tp_new and tp_init) is left 0.  Instances are
   created in C by PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",              /*tp_name*/
    sizeof(struct encoding_map), /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,       /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    0,                          /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    encoding_map_methods,       /*tp_methods*/
    0,                          /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
4044
/* Build an encoding map from a decoding table.

   string must be a unicode object of exactly 256 characters, where
   string[i] is the character that byte i decodes to and 0xFFFE marks an
   unmapped byte; anything else raises via PyErr_BadArgument().

   Returns a new reference to either
     - an EncodingMap trie object, when byte 0 maps to character 0, no other
       byte maps to character 0 (or, on wide builds, above 0xFFFF) and the
       trie needs fewer than 0xFF level-2 and level-3 blocks, or
     - a dict mapping each character ordinal to its byte value otherwise.
   Returns NULL with an exception set on failure. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* Scratch tables for the sizing pass: which level-1 (c >> 11) and
       level-2 (c >> 7) slots are in use; 0xFF means "unused". */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    /* First pass: count how many level-2 and level-3 blocks are needed. */
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
#ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
#endif
            ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = decode[i] >> 11;
        l2 = decode[i] >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* Block numbers are stored in unsigned chars and 0xFF is reserved as
       the "unused" marker, so too many blocks also force the dict form. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Note: this declaration shadows the outer 'result'. */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = value = NULL;
            key = PyInt_FromLong(decode[i]);
            value = PyInt_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The "- 1" accounts for the one-byte level23 placeholder that is
       already included in sizeof(struct encoding_map). */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    /* A level-3 entry of 0 means "unmapped" (see encoding_map_lookup). */
    memset(mlevel3, 0, 128*count3);
    /* Second pass: level-3 block numbers are handed out again from 0
       while the real tables are filled in. */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;
        o2 = (decode[i]>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
4151
4152static int
4153encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4154{
4155 struct encoding_map *map = (struct encoding_map*)mapping;
4156 int l1 = c>>11;
4157 int l2 = (c>>7) & 0xF;
4158 int l3 = c & 0x7F;
4159 int i;
4160
4161#ifdef Py_UNICODE_WIDE
4162 if (c > 0xFFFF) {
4163 return -1;
4164 }
4165#endif
4166 if (c == 0)
4167 return 0;
4168 /* level 1*/
4169 i = map->level1[l1];
4170 if (i == 0xFF) {
4171 return -1;
4172 }
4173 /* level 2*/
4174 i = map->level23[16*i+l2];
4175 if (i == 0xFF) {
4176 return -1;
4177 }
4178 /* level 3 */
4179 i = map->level23[16*map->count2 + 128*i + l3];
4180 if (i == 0) {
4181 return -1;
4182 }
4183 return i;
4184}
4185
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186/* Lookup the character ch in the mapping. If the character
4187 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004188 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 PyObject *w = PyInt_FromLong((long)c);
4192 PyObject *x;
4193
4194 if (w == NULL)
4195 return NULL;
4196 x = PyObject_GetItem(mapping, w);
4197 Py_DECREF(w);
4198 if (x == NULL) {
4199 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4200 /* No mapping found means: mapping is undefined. */
4201 PyErr_Clear();
4202 x = Py_None;
4203 Py_INCREF(x);
4204 return x;
4205 } else
4206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004208 else if (x == Py_None)
4209 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 else if (PyInt_Check(x)) {
4211 long value = PyInt_AS_LONG(x);
4212 if (value < 0 || value > 255) {
4213 PyErr_SetString(PyExc_TypeError,
4214 "character mapping must be in range(256)");
4215 Py_DECREF(x);
4216 return NULL;
4217 }
4218 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220 else if (PyString_Check(x))
4221 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004224 PyErr_Format(PyExc_TypeError,
4225 "character mapping must return integer, None or str8, not %.400s",
4226 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 Py_DECREF(x);
4228 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 }
4230}
4231
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004232static int
Walter Dörwald827b0552007-05-12 13:23:53 +00004233charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004234{
Walter Dörwald827b0552007-05-12 13:23:53 +00004235 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004236 /* exponentially overallocate to minimize reallocations */
4237 if (requiredsize < 2*outsize)
4238 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00004239 if (PyBytes_Resize(outobj, requiredsize)) {
4240 Py_DECREF(outobj);
4241 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004242 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004243 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004244}
4245
/* Outcome of trying to encode one character with a charmap
   (see charmapencode_output):
     enc_SUCCESS   - the character was encoded and written to the output
     enc_FAILED    - the character has no mapping; nothing was written
     enc_EXCEPTION - an error occurred (lookup or resize failure) */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004250 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 space is available. Return a new reference to the object that
4252 was put in the output buffer, or Py_None, if the mapping was undefined
4253 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004254 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004256charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00004257 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004259 PyObject *rep;
4260 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00004261 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004263 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004264 int res = encoding_map_lookup(c, mapping);
4265 Py_ssize_t requiredsize = *outpos+1;
4266 if (res == -1)
4267 return enc_FAILED;
4268 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004269 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004270 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004271 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004272 outstart[(*outpos)++] = (char)res;
4273 return enc_SUCCESS;
4274 }
4275
4276 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004278 return enc_EXCEPTION;
4279 else if (rep==Py_None) {
4280 Py_DECREF(rep);
4281 return enc_FAILED;
4282 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004284 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004285 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004286 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004288 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004290 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4292 }
4293 else {
4294 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004295 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4296 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004297 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004298 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004300 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004302 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 memcpy(outstart + *outpos, repchars, repsize);
4304 *outpos += repsize;
4305 }
4306 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004307 Py_DECREF(rep);
4308 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309}
4310
4311/* handle an error in PyUnicode_EncodeCharmap
4312 Return 0 on success, -1 on error */
4313static
4314int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004315 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004317 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004318 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319{
4320 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 Py_ssize_t repsize;
4322 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 Py_UNICODE *uni2;
4324 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004325 Py_ssize_t collstartpos = *inpos;
4326 Py_ssize_t collendpos = *inpos+1;
4327 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 char *encoding = "charmap";
4329 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004330 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 /* find all unencodable characters */
4333 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004334 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004335 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004336 int res = encoding_map_lookup(p[collendpos], mapping);
4337 if (res != -1)
4338 break;
4339 ++collendpos;
4340 continue;
4341 }
4342
4343 rep = charmapencode_lookup(p[collendpos], mapping);
4344 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004346 else if (rep!=Py_None) {
4347 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 break;
4349 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004350 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351 ++collendpos;
4352 }
4353 /* cache callback name lookup
4354 * (if not done yet, i.e. it's the first error) */
4355 if (*known_errorHandler==-1) {
4356 if ((errors==NULL) || (!strcmp(errors, "strict")))
4357 *known_errorHandler = 1;
4358 else if (!strcmp(errors, "replace"))
4359 *known_errorHandler = 2;
4360 else if (!strcmp(errors, "ignore"))
4361 *known_errorHandler = 3;
4362 else if (!strcmp(errors, "xmlcharrefreplace"))
4363 *known_errorHandler = 4;
4364 else
4365 *known_errorHandler = 0;
4366 }
4367 switch (*known_errorHandler) {
4368 case 1: /* strict */
4369 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4370 return -1;
4371 case 2: /* replace */
4372 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4373 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004374 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375 return -1;
4376 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004377 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4379 return -1;
4380 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 }
4382 /* fall through */
4383 case 3: /* ignore */
4384 *inpos = collendpos;
4385 break;
4386 case 4: /* xmlcharrefreplace */
4387 /* generate replacement (temporarily (mis)uses p) */
4388 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4389 char buffer[2+29+1+1];
4390 char *cp;
4391 sprintf(buffer, "&#%d;", (int)p[collpos]);
4392 for (cp = buffer; *cp; ++cp) {
4393 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004394 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004396 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4398 return -1;
4399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 }
4401 }
4402 *inpos = collendpos;
4403 break;
4404 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004405 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 encoding, reason, p, size, exceptionObject,
4407 collstartpos, collendpos, &newpos);
4408 if (repunicode == NULL)
4409 return -1;
4410 /* generate replacement */
4411 repsize = PyUnicode_GET_SIZE(repunicode);
4412 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4413 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004414 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 return -1;
4416 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004417 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4420 return -1;
4421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 }
4423 *inpos = newpos;
4424 Py_DECREF(repunicode);
4425 }
4426 return 0;
4427}
4428
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004430 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 PyObject *mapping,
4432 const char *errors)
4433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 /* output object */
4435 PyObject *res = NULL;
4436 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004437 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004439 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 PyObject *errorHandler = NULL;
4441 PyObject *exc = NULL;
4442 /* the following variable is used for caching string comparisons
4443 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4444 * 3=ignore, 4=xmlcharrefreplace */
4445 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446
4447 /* Default to Latin-1 */
4448 if (mapping == NULL)
4449 return PyUnicode_EncodeLatin1(p, size, errors);
4450
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 /* allocate enough for a simple encoding without
4452 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004453 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 if (res == NULL)
4455 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004456 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 while (inpos<size) {
4460 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004461 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004462 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004464 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 if (charmap_encoding_error(p, size, &inpos, mapping,
4466 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004467 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004468 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004469 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 else
4473 /* done with this character => adjust input position */
4474 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004478 if (respos<PyBytes_GET_SIZE(res)) {
4479 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 goto onError;
4481 }
4482 Py_XDECREF(exc);
4483 Py_XDECREF(errorHandler);
4484 return res;
4485
4486 onError:
4487 Py_XDECREF(res);
4488 Py_XDECREF(exc);
4489 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 return NULL;
4491}
4492
4493PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4494 PyObject *mapping)
4495{
4496 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4497 PyErr_BadArgument();
4498 return NULL;
4499 }
4500 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4501 PyUnicode_GET_SIZE(unicode),
4502 mapping,
4503 NULL);
4504}
4505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506/* create or adjust a UnicodeTranslateError */
4507static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004508 const Py_UNICODE *unicode, Py_ssize_t size,
4509 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 if (*exceptionObject == NULL) {
4513 *exceptionObject = PyUnicodeTranslateError_Create(
4514 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 }
4516 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4518 goto onError;
4519 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4520 goto onError;
4521 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4522 goto onError;
4523 return;
4524 onError:
4525 Py_DECREF(*exceptionObject);
4526 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 }
4528}
4529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530/* raises a UnicodeTranslateError */
4531static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004532 const Py_UNICODE *unicode, Py_ssize_t size,
4533 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 const char *reason)
4535{
4536 make_translate_exception(exceptionObject,
4537 unicode, size, startpos, endpos, reason);
4538 if (*exceptionObject != NULL)
4539 PyCodec_StrictErrors(*exceptionObject);
4540}
4541
4542/* error handling callback helper:
4543 build arguments, call the callback and check the arguments,
4544 put the result into newpos and return the replacement string, which
4545 has to be freed by the caller */
4546static PyObject *unicode_translate_call_errorhandler(const char *errors,
4547 PyObject **errorHandler,
4548 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004549 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4550 Py_ssize_t startpos, Py_ssize_t endpos,
4551 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004553 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004555 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 PyObject *restuple;
4557 PyObject *resunicode;
4558
4559 if (*errorHandler == NULL) {
4560 *errorHandler = PyCodec_LookupError(errors);
4561 if (*errorHandler == NULL)
4562 return NULL;
4563 }
4564
4565 make_translate_exception(exceptionObject,
4566 unicode, size, startpos, endpos, reason);
4567 if (*exceptionObject == NULL)
4568 return NULL;
4569
4570 restuple = PyObject_CallFunctionObjArgs(
4571 *errorHandler, *exceptionObject, NULL);
4572 if (restuple == NULL)
4573 return NULL;
4574 if (!PyTuple_Check(restuple)) {
4575 PyErr_Format(PyExc_TypeError, &argparse[4]);
4576 Py_DECREF(restuple);
4577 return NULL;
4578 }
4579 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004580 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 Py_DECREF(restuple);
4582 return NULL;
4583 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004584 if (i_newpos<0)
4585 *newpos = size+i_newpos;
4586 else
4587 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004588 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004589 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004590 Py_DECREF(restuple);
4591 return NULL;
4592 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 Py_INCREF(resunicode);
4594 Py_DECREF(restuple);
4595 return resunicode;
4596}
4597
4598/* Lookup the character ch in the mapping and put the result in result,
4599 which must be decrefed by the caller.
4600 Return 0 on success, -1 on error */
4601static
4602int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4603{
4604 PyObject *w = PyInt_FromLong((long)c);
4605 PyObject *x;
4606
4607 if (w == NULL)
4608 return -1;
4609 x = PyObject_GetItem(mapping, w);
4610 Py_DECREF(w);
4611 if (x == NULL) {
4612 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4613 /* No mapping found means: use 1:1 mapping. */
4614 PyErr_Clear();
4615 *result = NULL;
4616 return 0;
4617 } else
4618 return -1;
4619 }
4620 else if (x == Py_None) {
4621 *result = x;
4622 return 0;
4623 }
4624 else if (PyInt_Check(x)) {
4625 long value = PyInt_AS_LONG(x);
4626 long max = PyUnicode_GetMax();
4627 if (value < 0 || value > max) {
4628 PyErr_Format(PyExc_TypeError,
4629 "character mapping must be in range(0x%lx)", max+1);
4630 Py_DECREF(x);
4631 return -1;
4632 }
4633 *result = x;
4634 return 0;
4635 }
4636 else if (PyUnicode_Check(x)) {
4637 *result = x;
4638 return 0;
4639 }
4640 else {
4641 /* wrong return value */
4642 PyErr_SetString(PyExc_TypeError,
4643 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004644 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 return -1;
4646 }
4647}
4648/* ensure that *outobj is at least requiredsize characters long,
4649if not reallocate and adjust various state variables.
4650Return 0 on success, -1 on error */
4651static
Walter Dörwald4894c302003-10-24 14:25:28 +00004652int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004654{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004655 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004656 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004658 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004660 if (requiredsize < 2 * oldsize)
4661 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004662 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 return -1;
4664 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 }
4666 return 0;
4667}
4668/* lookup the character, put the result in the output string and adjust
4669 various state variables. Return a new reference to the object that
4670 was put in the output buffer in *result, or Py_None, if the mapping was
4671 undefined (in which case no character was written).
4672 The called must decref result.
4673 Return 0 on success, -1 on error. */
4674static
Walter Dörwald4894c302003-10-24 14:25:28 +00004675int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004676 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004677 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678{
Walter Dörwald4894c302003-10-24 14:25:28 +00004679 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 return -1;
4681 if (*res==NULL) {
4682 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004683 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 }
4685 else if (*res==Py_None)
4686 ;
4687 else if (PyInt_Check(*res)) {
4688 /* no overflow check, because we know that the space is enough */
4689 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4690 }
4691 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004692 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 if (repsize==1) {
4694 /* no overflow check, because we know that the space is enough */
4695 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4696 }
4697 else if (repsize!=0) {
4698 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004699 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004700 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004701 repsize - 1;
4702 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 return -1;
4704 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4705 *outp += repsize;
4706 }
4707 }
4708 else
4709 return -1;
4710 return 0;
4711}
4712
4713PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004714 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 PyObject *mapping,
4716 const char *errors)
4717{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 /* output object */
4719 PyObject *res = NULL;
4720 /* pointers to the beginning and end+1 of input */
4721 const Py_UNICODE *startp = p;
4722 const Py_UNICODE *endp = p + size;
4723 /* pointer into the output */
4724 Py_UNICODE *str;
4725 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004726 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 char *reason = "character maps to <undefined>";
4728 PyObject *errorHandler = NULL;
4729 PyObject *exc = NULL;
4730 /* the following variable is used for caching string comparisons
4731 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4732 * 3=ignore, 4=xmlcharrefreplace */
4733 int known_errorHandler = -1;
4734
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 if (mapping == NULL) {
4736 PyErr_BadArgument();
4737 return NULL;
4738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739
4740 /* allocate enough for a simple 1:1 translation without
4741 replacements, if we need more, we'll resize */
4742 res = PyUnicode_FromUnicode(NULL, size);
4743 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004744 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 return res;
4747 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 while (p<endp) {
4750 /* try to encode it */
4751 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004752 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 goto onError;
4755 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004756 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 if (x!=Py_None) /* it worked => adjust input pointer */
4758 ++p;
4759 else { /* untranslatable character */
4760 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004761 Py_ssize_t repsize;
4762 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 Py_UNICODE *uni2;
4764 /* startpos for collecting untranslatable chars */
4765 const Py_UNICODE *collstart = p;
4766 const Py_UNICODE *collend = p+1;
4767 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 /* find all untranslatable characters */
4770 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004771 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 goto onError;
4773 Py_XDECREF(x);
4774 if (x!=Py_None)
4775 break;
4776 ++collend;
4777 }
4778 /* cache callback name lookup
4779 * (if not done yet, i.e. it's the first error) */
4780 if (known_errorHandler==-1) {
4781 if ((errors==NULL) || (!strcmp(errors, "strict")))
4782 known_errorHandler = 1;
4783 else if (!strcmp(errors, "replace"))
4784 known_errorHandler = 2;
4785 else if (!strcmp(errors, "ignore"))
4786 known_errorHandler = 3;
4787 else if (!strcmp(errors, "xmlcharrefreplace"))
4788 known_errorHandler = 4;
4789 else
4790 known_errorHandler = 0;
4791 }
4792 switch (known_errorHandler) {
4793 case 1: /* strict */
4794 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4795 goto onError;
4796 case 2: /* replace */
4797 /* No need to check for space, this is a 1:1 replacement */
4798 for (coll = collstart; coll<collend; ++coll)
4799 *str++ = '?';
4800 /* fall through */
4801 case 3: /* ignore */
4802 p = collend;
4803 break;
4804 case 4: /* xmlcharrefreplace */
4805 /* generate replacement (temporarily (mis)uses p) */
4806 for (p = collstart; p < collend; ++p) {
4807 char buffer[2+29+1+1];
4808 char *cp;
4809 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004810 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4812 goto onError;
4813 for (cp = buffer; *cp; ++cp)
4814 *str++ = *cp;
4815 }
4816 p = collend;
4817 break;
4818 default:
4819 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4820 reason, startp, size, &exc,
4821 collstart-startp, collend-startp, &newpos);
4822 if (repunicode == NULL)
4823 goto onError;
4824 /* generate replacement */
4825 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004826 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4828 Py_DECREF(repunicode);
4829 goto onError;
4830 }
4831 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4832 *str++ = *uni2;
4833 p = startp + newpos;
4834 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 }
4836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004838 /* Resize if we allocated to much */
4839 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004840 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004841 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004842 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 }
4844 Py_XDECREF(exc);
4845 Py_XDECREF(errorHandler);
4846 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 onError:
4849 Py_XDECREF(res);
4850 Py_XDECREF(exc);
4851 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 return NULL;
4853}
4854
4855PyObject *PyUnicode_Translate(PyObject *str,
4856 PyObject *mapping,
4857 const char *errors)
4858{
4859 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004860
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 str = PyUnicode_FromObject(str);
4862 if (str == NULL)
4863 goto onError;
4864 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4865 PyUnicode_GET_SIZE(str),
4866 mapping,
4867 errors);
4868 Py_DECREF(str);
4869 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004870
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 onError:
4872 Py_XDECREF(str);
4873 return NULL;
4874}
Tim Petersced69f82003-09-16 20:30:58 +00004875
Guido van Rossum9e896b32000-04-05 20:11:21 +00004876/* --- Decimal Encoder ---------------------------------------------------- */
4877
4878int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004879 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004880 char *output,
4881 const char *errors)
4882{
4883 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884 PyObject *errorHandler = NULL;
4885 PyObject *exc = NULL;
4886 const char *encoding = "decimal";
4887 const char *reason = "invalid decimal Unicode string";
4888 /* the following variable is used for caching string comparisons
4889 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4890 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004891
4892 if (output == NULL) {
4893 PyErr_BadArgument();
4894 return -1;
4895 }
4896
4897 p = s;
4898 end = s + length;
4899 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004901 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004903 Py_ssize_t repsize;
4904 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 Py_UNICODE *uni2;
4906 Py_UNICODE *collstart;
4907 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004908
Guido van Rossum9e896b32000-04-05 20:11:21 +00004909 if (Py_UNICODE_ISSPACE(ch)) {
4910 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004912 continue;
4913 }
4914 decimal = Py_UNICODE_TODECIMAL(ch);
4915 if (decimal >= 0) {
4916 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004917 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004918 continue;
4919 }
Guido van Rossumba477042000-04-06 18:18:10 +00004920 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004921 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004923 continue;
4924 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 /* All other characters are considered unencodable */
4926 collstart = p;
4927 collend = p+1;
4928 while (collend < end) {
4929 if ((0 < *collend && *collend < 256) ||
4930 !Py_UNICODE_ISSPACE(*collend) ||
4931 Py_UNICODE_TODECIMAL(*collend))
4932 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004933 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 /* cache callback name lookup
4935 * (if not done yet, i.e. it's the first error) */
4936 if (known_errorHandler==-1) {
4937 if ((errors==NULL) || (!strcmp(errors, "strict")))
4938 known_errorHandler = 1;
4939 else if (!strcmp(errors, "replace"))
4940 known_errorHandler = 2;
4941 else if (!strcmp(errors, "ignore"))
4942 known_errorHandler = 3;
4943 else if (!strcmp(errors, "xmlcharrefreplace"))
4944 known_errorHandler = 4;
4945 else
4946 known_errorHandler = 0;
4947 }
4948 switch (known_errorHandler) {
4949 case 1: /* strict */
4950 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4951 goto onError;
4952 case 2: /* replace */
4953 for (p = collstart; p < collend; ++p)
4954 *output++ = '?';
4955 /* fall through */
4956 case 3: /* ignore */
4957 p = collend;
4958 break;
4959 case 4: /* xmlcharrefreplace */
4960 /* generate replacement (temporarily (mis)uses p) */
4961 for (p = collstart; p < collend; ++p)
4962 output += sprintf(output, "&#%d;", (int)*p);
4963 p = collend;
4964 break;
4965 default:
4966 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4967 encoding, reason, s, length, &exc,
4968 collstart-s, collend-s, &newpos);
4969 if (repunicode == NULL)
4970 goto onError;
4971 /* generate replacement */
4972 repsize = PyUnicode_GET_SIZE(repunicode);
4973 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4974 Py_UNICODE ch = *uni2;
4975 if (Py_UNICODE_ISSPACE(ch))
4976 *output++ = ' ';
4977 else {
4978 decimal = Py_UNICODE_TODECIMAL(ch);
4979 if (decimal >= 0)
4980 *output++ = '0' + decimal;
4981 else if (0 < ch && ch < 256)
4982 *output++ = (char)ch;
4983 else {
4984 Py_DECREF(repunicode);
4985 raise_encode_exception(&exc, encoding,
4986 s, length, collstart-s, collend-s, reason);
4987 goto onError;
4988 }
4989 }
4990 }
4991 p = s + newpos;
4992 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004993 }
4994 }
4995 /* 0-terminate the output string */
4996 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004997 Py_XDECREF(exc);
4998 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004999 return 0;
5000
5001 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 Py_XDECREF(exc);
5003 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005004 return -1;
5005}
5006
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007/* --- Helpers ------------------------------------------------------------ */
5008
Thomas Wouters477c8d52006-05-27 19:21:47 +00005009#define STRINGLIB_CHAR Py_UNICODE
5010
5011#define STRINGLIB_LEN PyUnicode_GET_SIZE
5012#define STRINGLIB_NEW PyUnicode_FromUnicode
5013#define STRINGLIB_STR PyUnicode_AS_UNICODE
5014
5015Py_LOCAL_INLINE(int)
5016STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005018 if (str[0] != other[0])
5019 return 1;
5020 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021}
5022
Thomas Wouters477c8d52006-05-27 19:21:47 +00005023#define STRINGLIB_EMPTY unicode_empty
5024
5025#include "stringlib/fastsearch.h"
5026
5027#include "stringlib/count.h"
5028#include "stringlib/find.h"
5029#include "stringlib/partition.h"
5030
/* helper macro to fixup start/end slice values.

   Operates on the variables `start` and `end` that must exist in the
   invoking scope; obj is a pointer to an object with a `length` member.
   Negative values are taken relative to the length and then clamped to
   0; `end` is additionally capped at the length (`start` is not).

   Wrapped in do { } while (0) so that an invocation followed by ';'
   is exactly one statement and is safe in unbraced if/else bodies. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5043
Martin v. Löwis18e16552006-02-15 17:27:45 +00005044Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005045 PyObject *substr,
5046 Py_ssize_t start,
5047 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005049 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005050 PyUnicodeObject* str_obj;
5051 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005052
Thomas Wouters477c8d52006-05-27 19:21:47 +00005053 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5054 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005056 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5057 if (!sub_obj) {
5058 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 return -1;
5060 }
Tim Petersced69f82003-09-16 20:30:58 +00005061
Thomas Wouters477c8d52006-05-27 19:21:47 +00005062 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005063
Thomas Wouters477c8d52006-05-27 19:21:47 +00005064 result = stringlib_count(
5065 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5066 );
5067
5068 Py_DECREF(sub_obj);
5069 Py_DECREF(str_obj);
5070
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 return result;
5072}
5073
Martin v. Löwis18e16552006-02-15 17:27:45 +00005074Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005075 PyObject *sub,
5076 Py_ssize_t start,
5077 Py_ssize_t end,
5078 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005080 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005081
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005083 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005084 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005085 sub = PyUnicode_FromObject(sub);
5086 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005087 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005088 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 }
Tim Petersced69f82003-09-16 20:30:58 +00005090
Thomas Wouters477c8d52006-05-27 19:21:47 +00005091 if (direction > 0)
5092 result = stringlib_find_slice(
5093 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5094 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5095 start, end
5096 );
5097 else
5098 result = stringlib_rfind_slice(
5099 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5100 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5101 start, end
5102 );
5103
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005105 Py_DECREF(sub);
5106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 return result;
5108}
5109
Tim Petersced69f82003-09-16 20:30:58 +00005110static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111int tailmatch(PyUnicodeObject *self,
5112 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005113 Py_ssize_t start,
5114 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 int direction)
5116{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 if (substring->length == 0)
5118 return 1;
5119
Thomas Wouters477c8d52006-05-27 19:21:47 +00005120 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121
5122 end -= substring->length;
5123 if (end < start)
5124 return 0;
5125
5126 if (direction > 0) {
5127 if (Py_UNICODE_MATCH(self, end, substring))
5128 return 1;
5129 } else {
5130 if (Py_UNICODE_MATCH(self, start, substring))
5131 return 1;
5132 }
5133
5134 return 0;
5135}
5136
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005139 Py_ssize_t start,
5140 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 int direction)
5142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 str = PyUnicode_FromObject(str);
5146 if (str == NULL)
5147 return -1;
5148 substr = PyUnicode_FromObject(substr);
5149 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005150 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 return -1;
5152 }
Tim Petersced69f82003-09-16 20:30:58 +00005153
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 result = tailmatch((PyUnicodeObject *)str,
5155 (PyUnicodeObject *)substr,
5156 start, end, direction);
5157 Py_DECREF(str);
5158 Py_DECREF(substr);
5159 return result;
5160}
5161
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162/* Apply fixfct filter to the Unicode object self and return a
5163 reference to the modified object */
5164
Tim Petersced69f82003-09-16 20:30:58 +00005165static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166PyObject *fixup(PyUnicodeObject *self,
5167 int (*fixfct)(PyUnicodeObject *s))
5168{
5169
5170 PyUnicodeObject *u;
5171
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005172 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 if (u == NULL)
5174 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005175
5176 Py_UNICODE_COPY(u->str, self->str, self->length);
5177
Tim Peters7a29bd52001-09-12 03:03:31 +00005178 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 /* fixfct should return TRUE if it modified the buffer. If
5180 FALSE, return a reference to the original buffer instead
5181 (to save space, not time) */
5182 Py_INCREF(self);
5183 Py_DECREF(u);
5184 return (PyObject*) self;
5185 }
5186 return (PyObject*) u;
5187}
5188
Tim Petersced69f82003-09-16 20:30:58 +00005189static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190int fixupper(PyUnicodeObject *self)
5191{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005192 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 Py_UNICODE *s = self->str;
5194 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005195
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 while (len-- > 0) {
5197 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 ch = Py_UNICODE_TOUPPER(*s);
5200 if (ch != *s) {
5201 status = 1;
5202 *s = ch;
5203 }
5204 s++;
5205 }
5206
5207 return status;
5208}
5209
Tim Petersced69f82003-09-16 20:30:58 +00005210static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211int fixlower(PyUnicodeObject *self)
5212{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005213 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 Py_UNICODE *s = self->str;
5215 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005216
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 while (len-- > 0) {
5218 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 ch = Py_UNICODE_TOLOWER(*s);
5221 if (ch != *s) {
5222 status = 1;
5223 *s = ch;
5224 }
5225 s++;
5226 }
5227
5228 return status;
5229}
5230
Tim Petersced69f82003-09-16 20:30:58 +00005231static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232int fixswapcase(PyUnicodeObject *self)
5233{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 Py_UNICODE *s = self->str;
5236 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005237
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 while (len-- > 0) {
5239 if (Py_UNICODE_ISUPPER(*s)) {
5240 *s = Py_UNICODE_TOLOWER(*s);
5241 status = 1;
5242 } else if (Py_UNICODE_ISLOWER(*s)) {
5243 *s = Py_UNICODE_TOUPPER(*s);
5244 status = 1;
5245 }
5246 s++;
5247 }
5248
5249 return status;
5250}
5251
Tim Petersced69f82003-09-16 20:30:58 +00005252static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253int fixcapitalize(PyUnicodeObject *self)
5254{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005255 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005256 Py_UNICODE *s = self->str;
5257 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005258
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005259 if (len == 0)
5260 return 0;
5261 if (Py_UNICODE_ISLOWER(*s)) {
5262 *s = Py_UNICODE_TOUPPER(*s);
5263 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005265 s++;
5266 while (--len > 0) {
5267 if (Py_UNICODE_ISUPPER(*s)) {
5268 *s = Py_UNICODE_TOLOWER(*s);
5269 status = 1;
5270 }
5271 s++;
5272 }
5273 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274}
5275
5276static
5277int fixtitle(PyUnicodeObject *self)
5278{
5279 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5280 register Py_UNICODE *e;
5281 int previous_is_cased;
5282
5283 /* Shortcut for single character strings */
5284 if (PyUnicode_GET_SIZE(self) == 1) {
5285 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5286 if (*p != ch) {
5287 *p = ch;
5288 return 1;
5289 }
5290 else
5291 return 0;
5292 }
Tim Petersced69f82003-09-16 20:30:58 +00005293
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 e = p + PyUnicode_GET_SIZE(self);
5295 previous_is_cased = 0;
5296 for (; p < e; p++) {
5297 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005298
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 if (previous_is_cased)
5300 *p = Py_UNICODE_TOLOWER(ch);
5301 else
5302 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005303
5304 if (Py_UNICODE_ISLOWER(ch) ||
5305 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 Py_UNICODE_ISTITLE(ch))
5307 previous_is_cased = 1;
5308 else
5309 previous_is_cased = 0;
5310 }
5311 return 1;
5312}
5313
Tim Peters8ce9f162004-08-27 01:49:32 +00005314PyObject *
5315PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316{
Tim Peters8ce9f162004-08-27 01:49:32 +00005317 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005318 const Py_UNICODE blank = ' ';
5319 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005320 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005321 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005322 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5323 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005324 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5325 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005326 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005327 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005328 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329
Tim Peters05eba1f2004-08-27 21:32:02 +00005330 fseq = PySequence_Fast(seq, "");
5331 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005332 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005333 }
5334
Tim Peters91879ab2004-08-27 22:35:44 +00005335 /* Grrrr. A codec may be invoked to convert str objects to
5336 * Unicode, and so it's possible to call back into Python code
5337 * during PyUnicode_FromObject(), and so it's possible for a sick
5338 * codec to change the size of fseq (if seq is a list). Therefore
5339 * we have to keep refetching the size -- can't assume seqlen
5340 * is invariant.
5341 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005342 seqlen = PySequence_Fast_GET_SIZE(fseq);
5343 /* If empty sequence, return u"". */
5344 if (seqlen == 0) {
5345 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5346 goto Done;
5347 }
5348 /* If singleton sequence with an exact Unicode, return that. */
5349 if (seqlen == 1) {
5350 item = PySequence_Fast_GET_ITEM(fseq, 0);
5351 if (PyUnicode_CheckExact(item)) {
5352 Py_INCREF(item);
5353 res = (PyUnicodeObject *)item;
5354 goto Done;
5355 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005356 }
5357
Tim Peters05eba1f2004-08-27 21:32:02 +00005358 /* At least two items to join, or one that isn't exact Unicode. */
5359 if (seqlen > 1) {
5360 /* Set up sep and seplen -- they're needed. */
5361 if (separator == NULL) {
5362 sep = &blank;
5363 seplen = 1;
5364 }
5365 else {
5366 internal_separator = PyUnicode_FromObject(separator);
5367 if (internal_separator == NULL)
5368 goto onError;
5369 sep = PyUnicode_AS_UNICODE(internal_separator);
5370 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005371 /* In case PyUnicode_FromObject() mutated seq. */
5372 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005373 }
5374 }
5375
5376 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005377 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005378 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005379 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005380 res_p = PyUnicode_AS_UNICODE(res);
5381 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005382
Tim Peters05eba1f2004-08-27 21:32:02 +00005383 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005384 Py_ssize_t itemlen;
5385 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005386
5387 item = PySequence_Fast_GET_ITEM(fseq, i);
5388 /* Convert item to Unicode. */
5389 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5390 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005391 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005392 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005393 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005394 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005395 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005396 item = PyUnicode_FromObject(item);
5397 if (item == NULL)
5398 goto onError;
5399 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005400
Tim Peters91879ab2004-08-27 22:35:44 +00005401 /* In case PyUnicode_FromObject() mutated seq. */
5402 seqlen = PySequence_Fast_GET_SIZE(fseq);
5403
Tim Peters8ce9f162004-08-27 01:49:32 +00005404 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005406 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005407 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005408 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005409 if (i < seqlen - 1) {
5410 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005411 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005412 goto Overflow;
5413 }
5414 if (new_res_used > res_alloc) {
5415 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005416 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005417 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005418 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005419 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005420 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005421 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005422 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005424 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005425 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005427
5428 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005429 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005430 res_p += itemlen;
5431 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005432 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005433 res_p += seplen;
5434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005436 res_used = new_res_used;
5437 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005438
Tim Peters05eba1f2004-08-27 21:32:02 +00005439 /* Shrink res to match the used area; this probably can't fail,
5440 * but it's cheap to check.
5441 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005442 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005443 goto onError;
5444
5445 Done:
5446 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005447 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 return (PyObject *)res;
5449
Tim Peters8ce9f162004-08-27 01:49:32 +00005450 Overflow:
5451 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005452 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005453 Py_DECREF(item);
5454 /* fall through */
5455
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005457 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005458 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005459 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 return NULL;
5461}
5462
Tim Petersced69f82003-09-16 20:30:58 +00005463static
5464PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005465 Py_ssize_t left,
5466 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 Py_UNICODE fill)
5468{
5469 PyUnicodeObject *u;
5470
5471 if (left < 0)
5472 left = 0;
5473 if (right < 0)
5474 right = 0;
5475
Tim Peters7a29bd52001-09-12 03:03:31 +00005476 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 Py_INCREF(self);
5478 return self;
5479 }
5480
5481 u = _PyUnicode_New(left + self->length + right);
5482 if (u) {
5483 if (left)
5484 Py_UNICODE_FILL(u->str, fill, left);
5485 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5486 if (right)
5487 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5488 }
5489
5490 return u;
5491}
5492
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the enclosing function providing:
     - a `PyObject *str` scratch variable,
     - a `PyObject *list` to append to,
     - an `onError` label to jump to on failure.

   Wrapped in do { ... } while (0) so the expansion is a single statement
   that is safe in unbraced if/else bodies; callers must terminate the
   invocation with a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list now holds its own reference */                      \
        Py_DECREF(str);                                                 \
    } while (0)
5503
5504static
5505PyObject *split_whitespace(PyUnicodeObject *self,
5506 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005507 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005509 register Py_ssize_t i;
5510 register Py_ssize_t j;
5511 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 PyObject *str;
5513
5514 for (i = j = 0; i < len; ) {
5515 /* find a token */
5516 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5517 i++;
5518 j = i;
5519 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5520 i++;
5521 if (j < i) {
5522 if (maxcount-- <= 0)
5523 break;
5524 SPLIT_APPEND(self->str, j, i);
5525 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5526 i++;
5527 j = i;
5528 }
5529 }
5530 if (j < len) {
5531 SPLIT_APPEND(self->str, j, len);
5532 }
5533 return list;
5534
5535 onError:
5536 Py_DECREF(list);
5537 return NULL;
5538}
5539
5540PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005541 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005543 register Py_ssize_t i;
5544 register Py_ssize_t j;
5545 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 PyObject *list;
5547 PyObject *str;
5548 Py_UNICODE *data;
5549
5550 string = PyUnicode_FromObject(string);
5551 if (string == NULL)
5552 return NULL;
5553 data = PyUnicode_AS_UNICODE(string);
5554 len = PyUnicode_GET_SIZE(string);
5555
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 list = PyList_New(0);
5557 if (!list)
5558 goto onError;
5559
5560 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005562
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005564 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566
5567 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005568 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 if (i < len) {
5570 if (data[i] == '\r' && i + 1 < len &&
5571 data[i+1] == '\n')
5572 i += 2;
5573 else
5574 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005575 if (keepends)
5576 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
Guido van Rossum86662912000-04-11 15:38:46 +00005578 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 j = i;
5580 }
5581 if (j < len) {
5582 SPLIT_APPEND(data, j, len);
5583 }
5584
5585 Py_DECREF(string);
5586 return list;
5587
5588 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005589 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 Py_DECREF(string);
5591 return NULL;
5592}
5593
Tim Petersced69f82003-09-16 20:30:58 +00005594static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595PyObject *split_char(PyUnicodeObject *self,
5596 PyObject *list,
5597 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005598 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005600 register Py_ssize_t i;
5601 register Py_ssize_t j;
5602 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 PyObject *str;
5604
5605 for (i = j = 0; i < len; ) {
5606 if (self->str[i] == ch) {
5607 if (maxcount-- <= 0)
5608 break;
5609 SPLIT_APPEND(self->str, j, i);
5610 i = j = i + 1;
5611 } else
5612 i++;
5613 }
5614 if (j <= len) {
5615 SPLIT_APPEND(self->str, j, len);
5616 }
5617 return list;
5618
5619 onError:
5620 Py_DECREF(list);
5621 return NULL;
5622}
5623
Tim Petersced69f82003-09-16 20:30:58 +00005624static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625PyObject *split_substring(PyUnicodeObject *self,
5626 PyObject *list,
5627 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005628 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005630 register Py_ssize_t i;
5631 register Py_ssize_t j;
5632 Py_ssize_t len = self->length;
5633 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 PyObject *str;
5635
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005636 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 if (Py_UNICODE_MATCH(self, i, substring)) {
5638 if (maxcount-- <= 0)
5639 break;
5640 SPLIT_APPEND(self->str, j, i);
5641 i = j = i + sublen;
5642 } else
5643 i++;
5644 }
5645 if (j <= len) {
5646 SPLIT_APPEND(self->str, j, len);
5647 }
5648 return list;
5649
5650 onError:
5651 Py_DECREF(list);
5652 return NULL;
5653}
5654
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005655static
5656PyObject *rsplit_whitespace(PyUnicodeObject *self,
5657 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005658 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 register Py_ssize_t i;
5661 register Py_ssize_t j;
5662 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005663 PyObject *str;
5664
5665 for (i = j = len - 1; i >= 0; ) {
5666 /* find a token */
5667 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5668 i--;
5669 j = i;
5670 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5671 i--;
5672 if (j > i) {
5673 if (maxcount-- <= 0)
5674 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005675 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005676 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5677 i--;
5678 j = i;
5679 }
5680 }
5681 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005682 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005683 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005684 if (PyList_Reverse(list) < 0)
5685 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005686 return list;
5687
5688 onError:
5689 Py_DECREF(list);
5690 return NULL;
5691}
5692
5693static
5694PyObject *rsplit_char(PyUnicodeObject *self,
5695 PyObject *list,
5696 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005697 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005699 register Py_ssize_t i;
5700 register Py_ssize_t j;
5701 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005702 PyObject *str;
5703
5704 for (i = j = len - 1; i >= 0; ) {
5705 if (self->str[i] == ch) {
5706 if (maxcount-- <= 0)
5707 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005708 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005709 j = i = i - 1;
5710 } else
5711 i--;
5712 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005713 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005714 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005715 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005716 if (PyList_Reverse(list) < 0)
5717 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005718 return list;
5719
5720 onError:
5721 Py_DECREF(list);
5722 return NULL;
5723}
5724
5725static
5726PyObject *rsplit_substring(PyUnicodeObject *self,
5727 PyObject *list,
5728 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005730{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005731 register Py_ssize_t i;
5732 register Py_ssize_t j;
5733 Py_ssize_t len = self->length;
5734 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005735 PyObject *str;
5736
5737 for (i = len - sublen, j = len; i >= 0; ) {
5738 if (Py_UNICODE_MATCH(self, i, substring)) {
5739 if (maxcount-- <= 0)
5740 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005741 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005742 j = i;
5743 i -= sublen;
5744 } else
5745 i--;
5746 }
5747 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005748 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005749 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005750 if (PyList_Reverse(list) < 0)
5751 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005752 return list;
5753
5754 onError:
5755 Py_DECREF(list);
5756 return NULL;
5757}
5758
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759#undef SPLIT_APPEND
5760
5761static
5762PyObject *split(PyUnicodeObject *self,
5763 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005764 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765{
5766 PyObject *list;
5767
5768 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005769 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770
5771 list = PyList_New(0);
5772 if (!list)
5773 return NULL;
5774
5775 if (substring == NULL)
5776 return split_whitespace(self,list,maxcount);
5777
5778 else if (substring->length == 1)
5779 return split_char(self,list,substring->str[0],maxcount);
5780
5781 else if (substring->length == 0) {
5782 Py_DECREF(list);
5783 PyErr_SetString(PyExc_ValueError, "empty separator");
5784 return NULL;
5785 }
5786 else
5787 return split_substring(self,list,substring,maxcount);
5788}
5789
Tim Petersced69f82003-09-16 20:30:58 +00005790static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005791PyObject *rsplit(PyUnicodeObject *self,
5792 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005793 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794{
5795 PyObject *list;
5796
5797 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005798 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799
5800 list = PyList_New(0);
5801 if (!list)
5802 return NULL;
5803
5804 if (substring == NULL)
5805 return rsplit_whitespace(self,list,maxcount);
5806
5807 else if (substring->length == 1)
5808 return rsplit_char(self,list,substring->str[0],maxcount);
5809
5810 else if (substring->length == 0) {
5811 Py_DECREF(list);
5812 PyErr_SetString(PyExc_ValueError, "empty separator");
5813 return NULL;
5814 }
5815 else
5816 return rsplit_substring(self,list,substring,maxcount);
5817}
5818
5819static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820PyObject *replace(PyUnicodeObject *self,
5821 PyUnicodeObject *str1,
5822 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005823 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824{
5825 PyUnicodeObject *u;
5826
5827 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005828 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829
Thomas Wouters477c8d52006-05-27 19:21:47 +00005830 if (str1->length == str2->length) {
5831 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005832 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005833 if (str1->length == 1) {
5834 /* replace characters */
5835 Py_UNICODE u1, u2;
5836 if (!findchar(self->str, self->length, str1->str[0]))
5837 goto nothing;
5838 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5839 if (!u)
5840 return NULL;
5841 Py_UNICODE_COPY(u->str, self->str, self->length);
5842 u1 = str1->str[0];
5843 u2 = str2->str[0];
5844 for (i = 0; i < u->length; i++)
5845 if (u->str[i] == u1) {
5846 if (--maxcount < 0)
5847 break;
5848 u->str[i] = u2;
5849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005851 i = fastsearch(
5852 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005854 if (i < 0)
5855 goto nothing;
5856 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5857 if (!u)
5858 return NULL;
5859 Py_UNICODE_COPY(u->str, self->str, self->length);
5860 while (i <= self->length - str1->length)
5861 if (Py_UNICODE_MATCH(self, i, str1)) {
5862 if (--maxcount < 0)
5863 break;
5864 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5865 i += str1->length;
5866 } else
5867 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005870
5871 Py_ssize_t n, i, j, e;
5872 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 Py_UNICODE *p;
5874
5875 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005876 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 if (n > maxcount)
5878 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005879 if (n == 0)
5880 goto nothing;
5881 /* new_size = self->length + n * (str2->length - str1->length)); */
5882 delta = (str2->length - str1->length);
5883 if (delta == 0) {
5884 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005886 product = n * (str2->length - str1->length);
5887 if ((product / (str2->length - str1->length)) != n) {
5888 PyErr_SetString(PyExc_OverflowError,
5889 "replace string is too long");
5890 return NULL;
5891 }
5892 new_size = self->length + product;
5893 if (new_size < 0) {
5894 PyErr_SetString(PyExc_OverflowError,
5895 "replace string is too long");
5896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 }
5898 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005899 u = _PyUnicode_New(new_size);
5900 if (!u)
5901 return NULL;
5902 i = 0;
5903 p = u->str;
5904 e = self->length - str1->length;
5905 if (str1->length > 0) {
5906 while (n-- > 0) {
5907 /* look for next match */
5908 j = i;
5909 while (j <= e) {
5910 if (Py_UNICODE_MATCH(self, j, str1))
5911 break;
5912 j++;
5913 }
5914 if (j > i) {
5915 if (j > e)
5916 break;
5917 /* copy unchanged part [i:j] */
5918 Py_UNICODE_COPY(p, self->str+i, j-i);
5919 p += j - i;
5920 }
5921 /* copy substitution string */
5922 if (str2->length > 0) {
5923 Py_UNICODE_COPY(p, str2->str, str2->length);
5924 p += str2->length;
5925 }
5926 i = j + str1->length;
5927 }
5928 if (i < self->length)
5929 /* copy tail [i:] */
5930 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5931 } else {
5932 /* interleave */
5933 while (n > 0) {
5934 Py_UNICODE_COPY(p, str2->str, str2->length);
5935 p += str2->length;
5936 if (--n <= 0)
5937 break;
5938 *p++ = self->str[i++];
5939 }
5940 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005944
5945nothing:
5946 /* nothing to replace; return original string (when possible) */
5947 if (PyUnicode_CheckExact(self)) {
5948 Py_INCREF(self);
5949 return (PyObject *) self;
5950 }
5951 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952}
5953
5954/* --- Unicode Object Methods --------------------------------------------- */
5955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005956PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957"S.title() -> unicode\n\
5958\n\
5959Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005960characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
5962static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005963unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 return fixup(self, fixtitle);
5966}
5967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005968PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969"S.capitalize() -> unicode\n\
5970\n\
5971Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005972have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
5974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005975unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 return fixup(self, fixcapitalize);
5978}
5979
#if 0
/* Disabled: S.capwords() is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word and join the words
   again with PyUnicode_Join(NULL, ...).  Returns a new reference, or
   NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)old_word, fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6017
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006018/* Argument converter. Coerces to a single unicode character */
6019
6020static int
6021convert_uc(PyObject *obj, void *addr)
6022{
6023 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6024 PyObject *uniobj;
6025 Py_UNICODE *unistr;
6026
6027 uniobj = PyUnicode_FromObject(obj);
6028 if (uniobj == NULL) {
6029 PyErr_SetString(PyExc_TypeError,
6030 "The fill character cannot be converted to Unicode");
6031 return 0;
6032 }
6033 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6034 PyErr_SetString(PyExc_TypeError,
6035 "The fill character must be exactly one character long");
6036 Py_DECREF(uniobj);
6037 return 0;
6038 }
6039 unistr = PyUnicode_AS_UNICODE(uniobj);
6040 *fillcharloc = unistr[0];
6041 Py_DECREF(uniobj);
6042 return 1;
6043}
6044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006045PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006046"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006048Return S centered in a Unicode string of length width. Padding is\n\
6049done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
6051static PyObject *
6052unicode_center(PyUnicodeObject *self, PyObject *args)
6053{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006054 Py_ssize_t marg, left;
6055 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006056 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057
Thomas Woutersde017742006-02-16 19:34:37 +00006058 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 return NULL;
6060
Tim Peters7a29bd52001-09-12 03:03:31 +00006061 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 Py_INCREF(self);
6063 return (PyObject*) self;
6064 }
6065
6066 marg = width - self->length;
6067 left = marg / 2 + (marg & width & 1);
6068
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006069 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070}
6071
Marc-André Lemburge5034372000-08-08 08:04:29 +00006072#if 0
6073
6074/* This code should go into some future Unicode collation support
6075 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006076 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006077
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006078/* speedy UTF-16 code point order comparison */
6079/* gleaned from: */
6080/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6081
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006082static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006083{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006084 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006085 0, 0, 0, 0, 0, 0, 0, 0,
6086 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006087 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006088};
6089
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090static int
6091unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6092{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006093 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006094
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 Py_UNICODE *s1 = str1->str;
6096 Py_UNICODE *s2 = str2->str;
6097
6098 len1 = str1->length;
6099 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006100
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006102 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006103
6104 c1 = *s1++;
6105 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006106
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006107 if (c1 > (1<<11) * 26)
6108 c1 += utf16Fixup[c1>>11];
6109 if (c2 > (1<<11) * 26)
6110 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006111 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006112
6113 if (c1 != c2)
6114 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006115
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006116 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 }
6118
6119 return (len1 < len2) ? -1 : (len1 != len2);
6120}
6121
Marc-André Lemburge5034372000-08-08 08:04:29 +00006122#else
6123
6124static int
6125unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6126{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006127 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006128
6129 Py_UNICODE *s1 = str1->str;
6130 Py_UNICODE *s2 = str2->str;
6131
6132 len1 = str1->length;
6133 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006134
Marc-André Lemburge5034372000-08-08 08:04:29 +00006135 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006136 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006137
Fredrik Lundh45714e92001-06-26 16:39:36 +00006138 c1 = *s1++;
6139 c2 = *s2++;
6140
6141 if (c1 != c2)
6142 return (c1 < c2) ? -1 : 1;
6143
Marc-André Lemburge5034372000-08-08 08:04:29 +00006144 len1--; len2--;
6145 }
6146
6147 return (len1 < len2) ? -1 : (len1 != len2);
6148}
6149
6150#endif
6151
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152int PyUnicode_Compare(PyObject *left,
6153 PyObject *right)
6154{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006155 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6156 return unicode_compare((PyUnicodeObject *)left,
6157 (PyUnicodeObject *)right);
6158 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6159 (PyUnicode_Check(left) && PyString_Check(right))) {
6160 if (PyUnicode_Check(left))
6161 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6162 if (PyUnicode_Check(right))
6163 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6164 assert(PyString_Check(left));
6165 assert(PyString_Check(right));
6166 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006168 PyErr_Format(PyExc_TypeError,
6169 "Can't compare %.100s and %.100s",
6170 left->ob_type->tp_name,
6171 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 return -1;
6173}
6174
Martin v. Löwis5b222132007-06-10 09:51:05 +00006175int
6176PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6177{
6178 int i;
6179 Py_UNICODE *id;
6180 assert(PyUnicode_Check(uni));
6181 id = PyUnicode_AS_UNICODE(uni);
6182 /* Compare Unicode string and source character set string */
6183 for (i = 0; id[i] && str[i]; i++)
6184 if (id[i] != str[i])
6185 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6186 if (id[i])
6187 return 1; /* uni is longer */
6188 if (str[i])
6189 return -1; /* str is longer */
6190 return 0;
6191}
6192
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006193PyObject *PyUnicode_RichCompare(PyObject *left,
6194 PyObject *right,
6195 int op)
6196{
6197 int result;
6198
6199 result = PyUnicode_Compare(left, right);
6200 if (result == -1 && PyErr_Occurred())
6201 goto onError;
6202
6203 /* Convert the return value to a Boolean */
6204 switch (op) {
6205 case Py_EQ:
6206 result = (result == 0);
6207 break;
6208 case Py_NE:
6209 result = (result != 0);
6210 break;
6211 case Py_LE:
6212 result = (result <= 0);
6213 break;
6214 case Py_GE:
6215 result = (result >= 0);
6216 break;
6217 case Py_LT:
6218 result = (result == -1);
6219 break;
6220 case Py_GT:
6221 result = (result == 1);
6222 break;
6223 }
6224 return PyBool_FromLong(result);
6225
6226 onError:
6227
6228 /* Standard case
6229
6230 Type errors mean that PyUnicode_FromObject() could not convert
6231 one of the arguments (usually the right hand side) to Unicode,
6232 ie. we can't handle the comparison request. However, it is
6233 possible that the other object knows a comparison method, which
6234 is why we return Py_NotImplemented to give the other object a
6235 chance.
6236
6237 */
6238 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6239 PyErr_Clear();
6240 Py_INCREF(Py_NotImplemented);
6241 return Py_NotImplemented;
6242 }
6243 if (op != Py_EQ && op != Py_NE)
6244 return NULL;
6245
6246 /* Equality comparison.
6247
6248 This is a special case: we silence any PyExc_UnicodeDecodeError
6249 and instead turn it into a PyErr_UnicodeWarning.
6250
6251 */
6252 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6253 return NULL;
6254 PyErr_Clear();
Skip Montanaro46fc3372007-08-12 11:44:53 +00006255 if (PyErr_WarnEx(PyExc_UnicodeWarning,
6256 (op == Py_EQ) ?
6257 "Unicode equal comparison "
6258 "failed to convert both arguments to Unicode - "
6259 "interpreting them as being unequal"
6260 :
6261 "Unicode unequal comparison "
6262 "failed to convert both arguments to Unicode - "
6263 "interpreting them as being unequal",
6264 1) < 0)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006265 return NULL;
6266 result = (op == Py_NE);
6267 return PyBool_FromLong(result);
6268}
6269
Guido van Rossum403d68b2000-03-13 15:55:09 +00006270int PyUnicode_Contains(PyObject *container,
6271 PyObject *element)
6272{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006273 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006274 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006275
6276 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006277 sub = PyUnicode_FromObject(element);
6278 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006279 PyErr_Format(PyExc_TypeError,
6280 "'in <string>' requires string as left operand, not %s",
6281 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006282 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006283 }
6284
Thomas Wouters477c8d52006-05-27 19:21:47 +00006285 str = PyUnicode_FromObject(container);
6286 if (!str) {
6287 Py_DECREF(sub);
6288 return -1;
6289 }
6290
6291 result = stringlib_contains_obj(str, sub);
6292
6293 Py_DECREF(str);
6294 Py_DECREF(sub);
6295
Guido van Rossum403d68b2000-03-13 15:55:09 +00006296 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006297}
6298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299/* Concat to string or Unicode object giving a new Unicode object. */
6300
6301PyObject *PyUnicode_Concat(PyObject *left,
6302 PyObject *right)
6303{
6304 PyUnicodeObject *u = NULL, *v = NULL, *w;
6305
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006306 if (PyBytes_Check(left) || PyBytes_Check(right))
6307 return PyBytes_Concat(left, right);
6308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 /* Coerce the two arguments */
6310 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6311 if (u == NULL)
6312 goto onError;
6313 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6314 if (v == NULL)
6315 goto onError;
6316
6317 /* Shortcuts */
6318 if (v == unicode_empty) {
6319 Py_DECREF(v);
6320 return (PyObject *)u;
6321 }
6322 if (u == unicode_empty) {
6323 Py_DECREF(u);
6324 return (PyObject *)v;
6325 }
6326
6327 /* Concat the two Unicode strings */
6328 w = _PyUnicode_New(u->length + v->length);
6329 if (w == NULL)
6330 goto onError;
6331 Py_UNICODE_COPY(w->str, u->str, u->length);
6332 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6333
6334 Py_DECREF(u);
6335 Py_DECREF(v);
6336 return (PyObject *)w;
6337
6338onError:
6339 Py_XDECREF(u);
6340 Py_XDECREF(v);
6341 return NULL;
6342}
6343
Walter Dörwald1ab83302007-05-18 17:15:44 +00006344void
6345PyUnicode_Append(PyObject **pleft, PyObject *right)
6346{
6347 PyObject *new;
6348 if (*pleft == NULL)
6349 return;
6350 if (right == NULL || !PyUnicode_Check(*pleft)) {
6351 Py_DECREF(*pleft);
6352 *pleft = NULL;
6353 return;
6354 }
6355 new = PyUnicode_Concat(*pleft, right);
6356 Py_DECREF(*pleft);
6357 *pleft = new;
6358}
6359
6360void
6361PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6362{
6363 PyUnicode_Append(pleft, right);
6364 Py_XDECREF(right);
6365}
6366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006367PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368"S.count(sub[, start[, end]]) -> int\n\
6369\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006370Return the number of non-overlapping occurrences of substring sub in\n\
6371Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006372interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373
6374static PyObject *
6375unicode_count(PyUnicodeObject *self, PyObject *args)
6376{
6377 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006378 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006379 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 PyObject *result;
6381
Guido van Rossumb8872e62000-05-09 14:14:27 +00006382 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6383 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 return NULL;
6385
6386 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006387 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 if (substring == NULL)
6389 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006390
Thomas Wouters477c8d52006-05-27 19:21:47 +00006391 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392
Thomas Wouters477c8d52006-05-27 19:21:47 +00006393 result = PyInt_FromSsize_t(
6394 stringlib_count(self->str + start, end - start,
6395 substring->str, substring->length)
6396 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
6398 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006399
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 return result;
6401}
6402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006403PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006404"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006406Encodes S using the codec registered for encoding. encoding defaults\n\
6407to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006408handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6410'xmlcharrefreplace' as well as any other name registered with\n\
6411codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
6413static PyObject *
6414unicode_encode(PyUnicodeObject *self, PyObject *args)
6415{
6416 char *encoding = NULL;
6417 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006418 PyObject *v;
6419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6421 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006422 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006423 if (v == NULL)
6424 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006425 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006426 if (PyString_Check(v)) {
6427 /* Old codec, turn it into bytes */
6428 PyObject *b = PyBytes_FromObject(v);
6429 Py_DECREF(v);
6430 return b;
6431 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006432 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006433 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006434 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006435 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006436 Py_DECREF(v);
6437 return NULL;
6438 }
6439 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006440
6441 onError:
6442 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006443}
6444
6445PyDoc_STRVAR(decode__doc__,
6446"S.decode([encoding[,errors]]) -> string or unicode\n\
6447\n\
6448Decodes S using the codec registered for encoding. encoding defaults\n\
6449to the default encoding. errors may be given to set a different error\n\
6450handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6451a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6452as well as any other name registerd with codecs.register_error that is\n\
6453able to handle UnicodeDecodeErrors.");
6454
6455static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006456unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006457{
6458 char *encoding = NULL;
6459 char *errors = NULL;
6460 PyObject *v;
6461
6462 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6463 return NULL;
6464 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006465 if (v == NULL)
6466 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006467 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6468 PyErr_Format(PyExc_TypeError,
6469 "decoder did not return a string/unicode object "
6470 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006471 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006472 Py_DECREF(v);
6473 return NULL;
6474 }
6475 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006476
6477 onError:
6478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479}
6480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006481PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482"S.expandtabs([tabsize]) -> unicode\n\
6483\n\
6484Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006485If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
6487static PyObject*
6488unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6489{
6490 Py_UNICODE *e;
6491 Py_UNICODE *p;
6492 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006493 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 PyUnicodeObject *u;
6495 int tabsize = 8;
6496
6497 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6498 return NULL;
6499
Thomas Wouters7e474022000-07-16 12:04:32 +00006500 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006501 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 e = self->str + self->length;
6503 for (p = self->str; p < e; p++)
6504 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006505 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006507 if (old_j > j) {
6508 PyErr_SetString(PyExc_OverflowError,
6509 "new string is too long");
6510 return NULL;
6511 }
6512 old_j = j;
6513 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 }
6515 else {
6516 j++;
6517 if (*p == '\n' || *p == '\r') {
6518 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006519 old_j = j = 0;
6520 if (i < 0) {
6521 PyErr_SetString(PyExc_OverflowError,
6522 "new string is too long");
6523 return NULL;
6524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 }
6526 }
6527
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006528 if ((i + j) < 0) {
6529 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6530 return NULL;
6531 }
6532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 /* Second pass: create output string and fill it */
6534 u = _PyUnicode_New(i + j);
6535 if (!u)
6536 return NULL;
6537
6538 j = 0;
6539 q = u->str;
6540
6541 for (p = self->str; p < e; p++)
6542 if (*p == '\t') {
6543 if (tabsize > 0) {
6544 i = tabsize - (j % tabsize);
6545 j += i;
6546 while (i--)
6547 *q++ = ' ';
6548 }
6549 }
6550 else {
6551 j++;
6552 *q++ = *p;
6553 if (*p == '\n' || *p == '\r')
6554 j = 0;
6555 }
6556
6557 return (PyObject*) u;
6558}
6559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006560PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561"S.find(sub [,start [,end]]) -> int\n\
6562\n\
6563Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006564such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565arguments start and end are interpreted as in slice notation.\n\
6566\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006567Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
6569static PyObject *
6570unicode_find(PyUnicodeObject *self, PyObject *args)
6571{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006572 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006573 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006574 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
Guido van Rossumb8872e62000-05-09 14:14:27 +00006577 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6578 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006580 substring = PyUnicode_FromObject(substring);
6581 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 return NULL;
6583
Thomas Wouters477c8d52006-05-27 19:21:47 +00006584 result = stringlib_find_slice(
6585 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6586 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6587 start, end
6588 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
6590 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006591
6592 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593}
6594
6595static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006596unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597{
6598 if (index < 0 || index >= self->length) {
6599 PyErr_SetString(PyExc_IndexError, "string index out of range");
6600 return NULL;
6601 }
6602
6603 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6604}
6605
6606static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006607unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006609 /* Since Unicode objects compare equal to their UTF-8 string
6610 counterparts, we hash the UTF-8 string. */
6611 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6612 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613}
6614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006615PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616"S.index(sub [,start [,end]]) -> int\n\
6617\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006618Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
6620static PyObject *
6621unicode_index(PyUnicodeObject *self, PyObject *args)
6622{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006623 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006624 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006625 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006626 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627
Guido van Rossumb8872e62000-05-09 14:14:27 +00006628 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6629 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006631 substring = PyUnicode_FromObject(substring);
6632 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return NULL;
6634
Thomas Wouters477c8d52006-05-27 19:21:47 +00006635 result = stringlib_find_slice(
6636 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6637 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6638 start, end
6639 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640
6641 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 if (result < 0) {
6644 PyErr_SetString(PyExc_ValueError, "substring not found");
6645 return NULL;
6646 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006647
Martin v. Löwis18e16552006-02-15 17:27:45 +00006648 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649}
6650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006651PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006652"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006654Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006655at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656
6657static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006658unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659{
6660 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6661 register const Py_UNICODE *e;
6662 int cased;
6663
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 /* Shortcut for single character strings */
6665 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006666 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006668 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006669 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006670 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006671
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 e = p + PyUnicode_GET_SIZE(self);
6673 cased = 0;
6674 for (; p < e; p++) {
6675 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006678 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 else if (!cased && Py_UNICODE_ISLOWER(ch))
6680 cased = 1;
6681 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006682 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683}
6684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006685PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006686"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006688Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006689at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
6691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006692unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693{
6694 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6695 register const Py_UNICODE *e;
6696 int cased;
6697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 /* Shortcut for single character strings */
6699 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006700 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006702 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006703 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006704 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006705
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 e = p + PyUnicode_GET_SIZE(self);
6707 cased = 0;
6708 for (; p < e; p++) {
6709 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006710
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 else if (!cased && Py_UNICODE_ISUPPER(ch))
6714 cased = 1;
6715 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006716 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006720"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006722Return True if S is a titlecased string and there is at least one\n\
6723character in S, i.e. upper- and titlecase characters may only\n\
6724follow uncased characters and lowercase characters only cased ones.\n\
6725Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
6727static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006728unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
6730 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6731 register const Py_UNICODE *e;
6732 int cased, previous_is_cased;
6733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 /* Shortcut for single character strings */
6735 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006736 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6737 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006739 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006740 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006741 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 e = p + PyUnicode_GET_SIZE(self);
6744 cased = 0;
6745 previous_is_cased = 0;
6746 for (; p < e; p++) {
6747 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006748
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6750 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006751 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 previous_is_cased = 1;
6753 cased = 1;
6754 }
6755 else if (Py_UNICODE_ISLOWER(ch)) {
6756 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006757 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 previous_is_cased = 1;
6759 cased = 1;
6760 }
6761 else
6762 previous_is_cased = 0;
6763 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006764 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765}
6766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006767PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006768"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006770Return True if all characters in S are whitespace\n\
6771and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772
6773static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006774unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775{
6776 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6777 register const Py_UNICODE *e;
6778
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 /* Shortcut for single character strings */
6780 if (PyUnicode_GET_SIZE(self) == 1 &&
6781 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006782 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006784 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006785 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006786 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006787
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 e = p + PyUnicode_GET_SIZE(self);
6789 for (; p < e; p++) {
6790 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006791 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006793 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794}
6795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006796PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006797"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006798\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006799Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006800and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006801
6802static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006803unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006804{
6805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6806 register const Py_UNICODE *e;
6807
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006808 /* Shortcut for single character strings */
6809 if (PyUnicode_GET_SIZE(self) == 1 &&
6810 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006811 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006812
6813 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006814 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006815 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006816
6817 e = p + PyUnicode_GET_SIZE(self);
6818 for (; p < e; p++) {
6819 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006820 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006822 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006823}
6824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006825PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006826"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006827\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006828Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006829and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006830
6831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006832unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006833{
6834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835 register const Py_UNICODE *e;
6836
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006837 /* Shortcut for single character strings */
6838 if (PyUnicode_GET_SIZE(self) == 1 &&
6839 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006840 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006841
6842 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006843 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006844 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006845
6846 e = p + PyUnicode_GET_SIZE(self);
6847 for (; p < e; p++) {
6848 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006849 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006851 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006852}
6853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006854PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006855"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006857Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859
6860static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006861unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862{
6863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864 register const Py_UNICODE *e;
6865
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 /* Shortcut for single character strings */
6867 if (PyUnicode_GET_SIZE(self) == 1 &&
6868 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006869 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006871 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006872 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006873 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006874
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 e = p + PyUnicode_GET_SIZE(self);
6876 for (; p < e; p++) {
6877 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006878 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006880 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881}
6882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006883PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006884"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006886Return True if all characters in S are digits\n\
6887and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888
6889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006890unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891{
6892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893 register const Py_UNICODE *e;
6894
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 /* Shortcut for single character strings */
6896 if (PyUnicode_GET_SIZE(self) == 1 &&
6897 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006898 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006900 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006901 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006902 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006903
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 e = p + PyUnicode_GET_SIZE(self);
6905 for (; p < e; p++) {
6906 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006907 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006909 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910}
6911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006912PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006913"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006915Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006916False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917
6918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006919unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920{
6921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6922 register const Py_UNICODE *e;
6923
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 /* Shortcut for single character strings */
6925 if (PyUnicode_GET_SIZE(self) == 1 &&
6926 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006929 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006930 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006932
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 e = p + PyUnicode_GET_SIZE(self);
6934 for (; p < e; p++) {
6935 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
Martin v. Löwis47383402007-08-15 07:32:56 +00006941int
6942PyUnicode_IsIdentifier(PyObject *self)
6943{
6944 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6945 register const Py_UNICODE *e;
6946
6947 /* Special case for empty strings */
6948 if (PyUnicode_GET_SIZE(self) == 0)
6949 return 0;
6950
6951 /* PEP 3131 says that the first character must be in
6952 XID_Start and subsequent characters in XID_Continue,
6953 and for the ASCII range, the 2.x rules apply (i.e
6954 start with letters and underscore, continue with
6955 letters, digits, underscore). However, given the current
6956 definition of XID_Start and XID_Continue, it is sufficient
6957 to check just for these, except that _ must be allowed
6958 as starting an identifier. */
6959 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6960 return 0;
6961
6962 e = p + PyUnicode_GET_SIZE(self);
6963 for (p++; p < e; p++) {
6964 if (!_PyUnicode_IsXidContinue(*p))
6965 return 0;
6966 }
6967 return 1;
6968}
6969
6970PyDoc_STRVAR(isidentifier__doc__,
6971"S.isidentifier() -> bool\n\
6972\n\
6973Return True if S is a valid identifier according\n\
6974to the language definition.");
6975
6976static PyObject*
6977unicode_isidentifier(PyObject *self)
6978{
6979 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6980}
6981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006982PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983"S.join(sequence) -> unicode\n\
6984\n\
6985Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006986sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987
6988static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006989unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006991 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992}
6993
Martin v. Löwis18e16552006-02-15 17:27:45 +00006994static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995unicode_length(PyUnicodeObject *self)
6996{
6997 return self->length;
6998}
6999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007000PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00007001"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002\n\
7003Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007004done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005
7006static PyObject *
7007unicode_ljust(PyUnicodeObject *self, PyObject *args)
7008{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007009 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007010 Py_UNICODE fillchar = ' ';
7011
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007012 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013 return NULL;
7014
Tim Peters7a29bd52001-09-12 03:03:31 +00007015 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 Py_INCREF(self);
7017 return (PyObject*) self;
7018 }
7019
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007020 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021}
7022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007023PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024"S.lower() -> unicode\n\
7025\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007026Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027
7028static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007029unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 return fixup(self, fixlower);
7032}
7033
/* Strip selectors.  They double as indices into stripformat[] below,
   so their numeric values must stay in step with that table. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its 3-char prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7042
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007043/* externally visible for str.strip(unicode) */
7044PyObject *
7045_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7046{
7047 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007048 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007049 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007050 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7051 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007052
Thomas Wouters477c8d52006-05-27 19:21:47 +00007053 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7054
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007055 i = 0;
7056 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007057 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7058 i++;
7059 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007060 }
7061
7062 j = len;
7063 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007064 do {
7065 j--;
7066 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7067 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007068 }
7069
7070 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007071 Py_INCREF(self);
7072 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007073 }
7074 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007075 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076}
7077
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078
7079static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007080do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007083 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007084
7085 i = 0;
7086 if (striptype != RIGHTSTRIP) {
7087 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7088 i++;
7089 }
7090 }
7091
7092 j = len;
7093 if (striptype != LEFTSTRIP) {
7094 do {
7095 j--;
7096 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7097 j++;
7098 }
7099
7100 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7101 Py_INCREF(self);
7102 return (PyObject*)self;
7103 }
7104 else
7105 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106}
7107
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007108
7109static PyObject *
7110do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7111{
7112 PyObject *sep = NULL;
7113
7114 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7115 return NULL;
7116
7117 if (sep != NULL && sep != Py_None) {
7118 if (PyUnicode_Check(sep))
7119 return _PyUnicode_XStrip(self, striptype, sep);
7120 else if (PyString_Check(sep)) {
7121 PyObject *res;
7122 sep = PyUnicode_FromObject(sep);
7123 if (sep==NULL)
7124 return NULL;
7125 res = _PyUnicode_XStrip(self, striptype, sep);
7126 Py_DECREF(sep);
7127 return res;
7128 }
7129 else {
7130 PyErr_Format(PyExc_TypeError,
7131 "%s arg must be None, unicode or str",
7132 STRIPNAME(striptype));
7133 return NULL;
7134 }
7135 }
7136
7137 return do_strip(self, striptype);
7138}
7139
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007142"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007143\n\
7144Return a copy of the string S with leading and trailing\n\
7145whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007146If chars is given and not None, remove characters in chars instead.\n\
7147If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007148
7149static PyObject *
7150unicode_strip(PyUnicodeObject *self, PyObject *args)
7151{
7152 if (PyTuple_GET_SIZE(args) == 0)
7153 return do_strip(self, BOTHSTRIP); /* Common case */
7154 else
7155 return do_argstrip(self, BOTHSTRIP, args);
7156}
7157
7158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007159PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007160"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007161\n\
7162Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007163If chars is given and not None, remove characters in chars instead.\n\
7164If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007165
7166static PyObject *
7167unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7168{
7169 if (PyTuple_GET_SIZE(args) == 0)
7170 return do_strip(self, LEFTSTRIP); /* Common case */
7171 else
7172 return do_argstrip(self, LEFTSTRIP, args);
7173}
7174
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007177"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007178\n\
7179Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007180If chars is given and not None, remove characters in chars instead.\n\
7181If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007182
7183static PyObject *
7184unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7185{
7186 if (PyTuple_GET_SIZE(args) == 0)
7187 return do_strip(self, RIGHTSTRIP); /* Common case */
7188 else
7189 return do_argstrip(self, RIGHTSTRIP, args);
7190}
7191
7192
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007194unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195{
7196 PyUnicodeObject *u;
7197 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007198 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007199 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200
7201 if (len < 0)
7202 len = 0;
7203
Tim Peters7a29bd52001-09-12 03:03:31 +00007204 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 /* no repeat, return original string */
7206 Py_INCREF(str);
7207 return (PyObject*) str;
7208 }
Tim Peters8f422462000-09-09 06:13:41 +00007209
7210 /* ensure # of chars needed doesn't overflow int and # of bytes
7211 * needed doesn't overflow size_t
7212 */
7213 nchars = len * str->length;
7214 if (len && nchars / len != str->length) {
7215 PyErr_SetString(PyExc_OverflowError,
7216 "repeated string is too long");
7217 return NULL;
7218 }
7219 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7220 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7221 PyErr_SetString(PyExc_OverflowError,
7222 "repeated string is too long");
7223 return NULL;
7224 }
7225 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226 if (!u)
7227 return NULL;
7228
7229 p = u->str;
7230
Thomas Wouters477c8d52006-05-27 19:21:47 +00007231 if (str->length == 1 && len > 0) {
7232 Py_UNICODE_FILL(p, str->str[0], len);
7233 } else {
7234 Py_ssize_t done = 0; /* number of characters copied this far */
7235 if (done < nchars) {
7236 Py_UNICODE_COPY(p, str->str, str->length);
7237 done = str->length;
7238 }
7239 while (done < nchars) {
7240 int n = (done <= nchars-done) ? done : nchars-done;
7241 Py_UNICODE_COPY(p+done, p, n);
7242 done += n;
7243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244 }
7245
7246 return (PyObject*) u;
7247}
7248
7249PyObject *PyUnicode_Replace(PyObject *obj,
7250 PyObject *subobj,
7251 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253{
7254 PyObject *self;
7255 PyObject *str1;
7256 PyObject *str2;
7257 PyObject *result;
7258
7259 self = PyUnicode_FromObject(obj);
7260 if (self == NULL)
7261 return NULL;
7262 str1 = PyUnicode_FromObject(subobj);
7263 if (str1 == NULL) {
7264 Py_DECREF(self);
7265 return NULL;
7266 }
7267 str2 = PyUnicode_FromObject(replobj);
7268 if (str2 == NULL) {
7269 Py_DECREF(self);
7270 Py_DECREF(str1);
7271 return NULL;
7272 }
Tim Petersced69f82003-09-16 20:30:58 +00007273 result = replace((PyUnicodeObject *)self,
7274 (PyUnicodeObject *)str1,
7275 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 maxcount);
7277 Py_DECREF(self);
7278 Py_DECREF(str1);
7279 Py_DECREF(str2);
7280 return result;
7281}
7282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007283PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284"S.replace (old, new[, maxsplit]) -> unicode\n\
7285\n\
7286Return a copy of S with all occurrences of substring\n\
7287old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007288given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289
7290static PyObject*
7291unicode_replace(PyUnicodeObject *self, PyObject *args)
7292{
7293 PyUnicodeObject *str1;
7294 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007295 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 PyObject *result;
7297
Martin v. Löwis18e16552006-02-15 17:27:45 +00007298 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299 return NULL;
7300 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7301 if (str1 == NULL)
7302 return NULL;
7303 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007304 if (str2 == NULL) {
7305 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308
7309 result = replace(self, str1, str2, maxcount);
7310
7311 Py_DECREF(str1);
7312 Py_DECREF(str2);
7313 return result;
7314}
7315
7316static
7317PyObject *unicode_repr(PyObject *unicode)
7318{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007319 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007320 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007321 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7322 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7323
7324 /* XXX(nnorwitz): rather than over-allocating, it would be
7325 better to choose a different scheme. Perhaps scan the
7326 first N-chars of the string and allocate based on that size.
7327 */
7328 /* Initial allocation is based on the longest-possible unichr
7329 escape.
7330
7331 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7332 unichr, so in this case it's the longest unichr escape. In
7333 narrow (UTF-16) builds this is five chars per source unichr
7334 since there are two unichrs in the surrogate pair, so in narrow
7335 (UTF-16) builds it's not the longest unichr escape.
7336
7337 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7338 so in the narrow (UTF-16) build case it's the longest unichr
7339 escape.
7340 */
7341
Walter Dörwald1ab83302007-05-18 17:15:44 +00007342 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007343 2 /* quotes */
7344#ifdef Py_UNICODE_WIDE
7345 + 10*size
7346#else
7347 + 6*size
7348#endif
7349 + 1);
7350 if (repr == NULL)
7351 return NULL;
7352
Walter Dörwald1ab83302007-05-18 17:15:44 +00007353 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007354
7355 /* Add quote */
7356 *p++ = (findchar(s, size, '\'') &&
7357 !findchar(s, size, '"')) ? '"' : '\'';
7358 while (size-- > 0) {
7359 Py_UNICODE ch = *s++;
7360
7361 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007362 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007363 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007364 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007365 continue;
7366 }
7367
7368#ifdef Py_UNICODE_WIDE
7369 /* Map 21-bit characters to '\U00xxxxxx' */
7370 else if (ch >= 0x10000) {
7371 *p++ = '\\';
7372 *p++ = 'U';
7373 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7374 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7375 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7376 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7377 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7378 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7379 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7380 *p++ = hexdigits[ch & 0x0000000F];
7381 continue;
7382 }
7383#else
7384 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7385 else if (ch >= 0xD800 && ch < 0xDC00) {
7386 Py_UNICODE ch2;
7387 Py_UCS4 ucs;
7388
7389 ch2 = *s++;
7390 size--;
7391 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7392 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7393 *p++ = '\\';
7394 *p++ = 'U';
7395 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7396 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7397 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7398 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7399 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7400 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7401 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7402 *p++ = hexdigits[ucs & 0x0000000F];
7403 continue;
7404 }
7405 /* Fall through: isolated surrogates are copied as-is */
7406 s--;
7407 size++;
7408 }
7409#endif
7410
7411 /* Map 16-bit characters to '\uxxxx' */
7412 if (ch >= 256) {
7413 *p++ = '\\';
7414 *p++ = 'u';
7415 *p++ = hexdigits[(ch >> 12) & 0x000F];
7416 *p++ = hexdigits[(ch >> 8) & 0x000F];
7417 *p++ = hexdigits[(ch >> 4) & 0x000F];
7418 *p++ = hexdigits[ch & 0x000F];
7419 }
7420
7421 /* Map special whitespace to '\t', \n', '\r' */
7422 else if (ch == '\t') {
7423 *p++ = '\\';
7424 *p++ = 't';
7425 }
7426 else if (ch == '\n') {
7427 *p++ = '\\';
7428 *p++ = 'n';
7429 }
7430 else if (ch == '\r') {
7431 *p++ = '\\';
7432 *p++ = 'r';
7433 }
7434
7435 /* Map non-printable US ASCII to '\xhh' */
7436 else if (ch < ' ' || ch >= 0x7F) {
7437 *p++ = '\\';
7438 *p++ = 'x';
7439 *p++ = hexdigits[(ch >> 4) & 0x000F];
7440 *p++ = hexdigits[ch & 0x000F];
7441 }
7442
7443 /* Copy everything else as-is */
7444 else
7445 *p++ = (char) ch;
7446 }
7447 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007448 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007449
7450 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007451 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007452 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453}
7454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007455PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456"S.rfind(sub [,start [,end]]) -> int\n\
7457\n\
7458Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007459such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460arguments start and end are interpreted as in slice notation.\n\
7461\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
7464static PyObject *
7465unicode_rfind(PyUnicodeObject *self, PyObject *args)
7466{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007467 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007468 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007469 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007470 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
Guido van Rossumb8872e62000-05-09 14:14:27 +00007472 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7473 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007475 substring = PyUnicode_FromObject(substring);
7476 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477 return NULL;
7478
Thomas Wouters477c8d52006-05-27 19:21:47 +00007479 result = stringlib_rfind_slice(
7480 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7481 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7482 start, end
7483 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484
7485 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007486
7487 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488}
7489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007490PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491"S.rindex(sub [,start [,end]]) -> int\n\
7492\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007493Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494
7495static PyObject *
7496unicode_rindex(PyUnicodeObject *self, PyObject *args)
7497{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007498 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007499 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007500 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502
Guido van Rossumb8872e62000-05-09 14:14:27 +00007503 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7504 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007506 substring = PyUnicode_FromObject(substring);
7507 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 return NULL;
7509
Thomas Wouters477c8d52006-05-27 19:21:47 +00007510 result = stringlib_rfind_slice(
7511 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7512 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7513 start, end
7514 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
7516 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007517
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 if (result < 0) {
7519 PyErr_SetString(PyExc_ValueError, "substring not found");
7520 return NULL;
7521 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007522 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523}
7524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007525PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007526"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527\n\
7528Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007529done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530
7531static PyObject *
7532unicode_rjust(PyUnicodeObject *self, PyObject *args)
7533{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007534 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007535 Py_UNICODE fillchar = ' ';
7536
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007537 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 return NULL;
7539
Tim Peters7a29bd52001-09-12 03:03:31 +00007540 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541 Py_INCREF(self);
7542 return (PyObject*) self;
7543 }
7544
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007545 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546}
7547
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007549unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550{
7551 /* standard clamping */
7552 if (start < 0)
7553 start = 0;
7554 if (end < 0)
7555 end = 0;
7556 if (end > self->length)
7557 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007558 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 /* full slice, return original string */
7560 Py_INCREF(self);
7561 return (PyObject*) self;
7562 }
7563 if (start > end)
7564 start = end;
7565 /* copy slice */
7566 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7567 end - start);
7568}
7569
7570PyObject *PyUnicode_Split(PyObject *s,
7571 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007572 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573{
7574 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007575
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 s = PyUnicode_FromObject(s);
7577 if (s == NULL)
7578 return NULL;
7579 if (sep != NULL) {
7580 sep = PyUnicode_FromObject(sep);
7581 if (sep == NULL) {
7582 Py_DECREF(s);
7583 return NULL;
7584 }
7585 }
7586
7587 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7588
7589 Py_DECREF(s);
7590 Py_XDECREF(sep);
7591 return result;
7592}
7593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007594PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595"S.split([sep [,maxsplit]]) -> list of strings\n\
7596\n\
7597Return a list of the words in S, using sep as the\n\
7598delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007599splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007600any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601
7602static PyObject*
7603unicode_split(PyUnicodeObject *self, PyObject *args)
7604{
7605 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007606 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607
Martin v. Löwis18e16552006-02-15 17:27:45 +00007608 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 return NULL;
7610
7611 if (substring == Py_None)
7612 return split(self, NULL, maxcount);
7613 else if (PyUnicode_Check(substring))
7614 return split(self, (PyUnicodeObject *)substring, maxcount);
7615 else
7616 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7617}
7618
Thomas Wouters477c8d52006-05-27 19:21:47 +00007619PyObject *
7620PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7621{
7622 PyObject* str_obj;
7623 PyObject* sep_obj;
7624 PyObject* out;
7625
7626 str_obj = PyUnicode_FromObject(str_in);
7627 if (!str_obj)
7628 return NULL;
7629 sep_obj = PyUnicode_FromObject(sep_in);
7630 if (!sep_obj) {
7631 Py_DECREF(str_obj);
7632 return NULL;
7633 }
7634
7635 out = stringlib_partition(
7636 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7637 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7638 );
7639
7640 Py_DECREF(sep_obj);
7641 Py_DECREF(str_obj);
7642
7643 return out;
7644}
7645
7646
7647PyObject *
7648PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7649{
7650 PyObject* str_obj;
7651 PyObject* sep_obj;
7652 PyObject* out;
7653
7654 str_obj = PyUnicode_FromObject(str_in);
7655 if (!str_obj)
7656 return NULL;
7657 sep_obj = PyUnicode_FromObject(sep_in);
7658 if (!sep_obj) {
7659 Py_DECREF(str_obj);
7660 return NULL;
7661 }
7662
7663 out = stringlib_rpartition(
7664 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7665 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7666 );
7667
7668 Py_DECREF(sep_obj);
7669 Py_DECREF(str_obj);
7670
7671 return out;
7672}
7673
7674PyDoc_STRVAR(partition__doc__,
7675"S.partition(sep) -> (head, sep, tail)\n\
7676\n\
7677Searches for the separator sep in S, and returns the part before it,\n\
7678the separator itself, and the part after it. If the separator is not\n\
7679found, returns S and two empty strings.");
7680
7681static PyObject*
7682unicode_partition(PyUnicodeObject *self, PyObject *separator)
7683{
7684 return PyUnicode_Partition((PyObject *)self, separator);
7685}
7686
7687PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007688"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007689\n\
7690Searches for the separator sep in S, starting at the end of S, and returns\n\
7691the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007692separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007693
7694static PyObject*
7695unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7696{
7697 return PyUnicode_RPartition((PyObject *)self, separator);
7698}
7699
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007700PyObject *PyUnicode_RSplit(PyObject *s,
7701 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007702 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007703{
7704 PyObject *result;
7705
7706 s = PyUnicode_FromObject(s);
7707 if (s == NULL)
7708 return NULL;
7709 if (sep != NULL) {
7710 sep = PyUnicode_FromObject(sep);
7711 if (sep == NULL) {
7712 Py_DECREF(s);
7713 return NULL;
7714 }
7715 }
7716
7717 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7718
7719 Py_DECREF(s);
7720 Py_XDECREF(sep);
7721 return result;
7722}
7723
7724PyDoc_STRVAR(rsplit__doc__,
7725"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7726\n\
7727Return a list of the words in S, using sep as the\n\
7728delimiter string, starting at the end of the string and\n\
7729working to the front. If maxsplit is given, at most maxsplit\n\
7730splits are done. If sep is not specified, any whitespace string\n\
7731is a separator.");
7732
7733static PyObject*
7734unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7735{
7736 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007737 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007738
Martin v. Löwis18e16552006-02-15 17:27:45 +00007739 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007740 return NULL;
7741
7742 if (substring == Py_None)
7743 return rsplit(self, NULL, maxcount);
7744 else if (PyUnicode_Check(substring))
7745 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7746 else
7747 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7748}
7749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007751"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752\n\
7753Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007754Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007755is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756
7757static PyObject*
7758unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7759{
Guido van Rossum86662912000-04-11 15:38:46 +00007760 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761
Guido van Rossum86662912000-04-11 15:38:46 +00007762 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 return NULL;
7764
Guido van Rossum86662912000-04-11 15:38:46 +00007765 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766}
7767
7768static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007769PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770{
Walter Dörwald346737f2007-05-31 10:44:43 +00007771 if (PyUnicode_CheckExact(self)) {
7772 Py_INCREF(self);
7773 return self;
7774 } else
7775 /* Subtype -- return genuine unicode string with the same value. */
7776 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7777 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778}
7779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781"S.swapcase() -> unicode\n\
7782\n\
7783Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007784and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
7786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007787unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 return fixup(self, fixswapcase);
7790}
7791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007792PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793"S.translate(table) -> unicode\n\
7794\n\
7795Return a copy of the string S, where all characters have been mapped\n\
7796through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007797Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7798Unmapped characters are left untouched. Characters mapped to None\n\
7799are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800
7801static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007802unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803{
Tim Petersced69f82003-09-16 20:30:58 +00007804 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007806 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 "ignore");
7808}
7809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007810PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811"S.upper() -> unicode\n\
7812\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007813Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814
7815static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007816unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 return fixup(self, fixupper);
7819}
7820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007821PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822"S.zfill(width) -> unicode\n\
7823\n\
7824Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007825of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826
7827static PyObject *
7828unicode_zfill(PyUnicodeObject *self, PyObject *args)
7829{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007830 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 PyUnicodeObject *u;
7832
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 Py_ssize_t width;
7834 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 return NULL;
7836
7837 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007838 if (PyUnicode_CheckExact(self)) {
7839 Py_INCREF(self);
7840 return (PyObject*) self;
7841 }
7842 else
7843 return PyUnicode_FromUnicode(
7844 PyUnicode_AS_UNICODE(self),
7845 PyUnicode_GET_SIZE(self)
7846 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 }
7848
7849 fill = width - self->length;
7850
7851 u = pad(self, fill, 0, '0');
7852
Walter Dörwald068325e2002-04-15 13:36:47 +00007853 if (u == NULL)
7854 return NULL;
7855
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 if (u->str[fill] == '+' || u->str[fill] == '-') {
7857 /* move sign to beginning of string */
7858 u->str[0] = u->str[fill];
7859 u->str[fill] = '0';
7860 }
7861
7862 return (PyObject*) u;
7863}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
#if 0
/* Debugging aid, compiled out: reports unicode_freelist_size as a
   Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007873PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007874"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007876Return True if S starts with the specified prefix, False otherwise.\n\
7877With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007878With optional end, stop comparing S at that position.\n\
7879prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880
7881static PyObject *
7882unicode_startswith(PyUnicodeObject *self,
7883 PyObject *args)
7884{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007885 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007887 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007888 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007889 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007891 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007892 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007894 if (PyTuple_Check(subobj)) {
7895 Py_ssize_t i;
7896 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7897 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7898 PyTuple_GET_ITEM(subobj, i));
7899 if (substring == NULL)
7900 return NULL;
7901 result = tailmatch(self, substring, start, end, -1);
7902 Py_DECREF(substring);
7903 if (result) {
7904 Py_RETURN_TRUE;
7905 }
7906 }
7907 /* nothing matched */
7908 Py_RETURN_FALSE;
7909 }
7910 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007912 return NULL;
7913 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007915 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916}
7917
7918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007919PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007920"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007922Return True if S ends with the specified suffix, False otherwise.\n\
7923With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007924With optional end, stop comparing S at that position.\n\
7925suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926
7927static PyObject *
7928unicode_endswith(PyUnicodeObject *self,
7929 PyObject *args)
7930{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007931 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007933 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007934 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007935 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007937 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7938 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007940 if (PyTuple_Check(subobj)) {
7941 Py_ssize_t i;
7942 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7943 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7944 PyTuple_GET_ITEM(subobj, i));
7945 if (substring == NULL)
7946 return NULL;
7947 result = tailmatch(self, substring, start, end, +1);
7948 Py_DECREF(substring);
7949 if (result) {
7950 Py_RETURN_TRUE;
7951 }
7952 }
7953 Py_RETURN_FALSE;
7954 }
7955 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007959 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007961 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962}
7963
7964
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007965
7966static PyObject *
7967unicode_getnewargs(PyUnicodeObject *v)
7968{
7969 return Py_BuildValue("(u#)", v->str, v->length);
7970}
7971
7972
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973static PyMethodDef unicode_methods[] = {
7974
7975 /* Order is according to common usage: often used methods should
7976 appear first, since lookup is done sequentially. */
7977
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007978 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7979 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7980 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007981 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007982 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7983 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7984 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7985 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7986 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7987 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7988 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007989 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007990 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7991 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7992 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007993 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007994 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007995/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7996 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7997 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7998 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007999 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008000 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008001 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008002 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008003 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8004 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8005 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8006 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8007 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8008 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8009 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8010 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8011 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8012 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8013 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8014 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8015 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8016 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008017 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008018 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008019#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008020 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021#endif
8022
8023#if 0
8024 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008025 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026#endif
8027
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008028 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 {NULL, NULL}
8030};
8031
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008032static PyObject *
8033unicode_mod(PyObject *v, PyObject *w)
8034{
8035 if (!PyUnicode_Check(v)) {
8036 Py_INCREF(Py_NotImplemented);
8037 return Py_NotImplemented;
8038 }
8039 return PyUnicode_Format(v, w);
8040}
8041
8042static PyNumberMethods unicode_as_number = {
8043 0, /*nb_add*/
8044 0, /*nb_subtract*/
8045 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008046 unicode_mod, /*nb_remainder*/
8047};
8048
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008050 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008051 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008052 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8053 (ssizeargfunc) unicode_getitem, /* sq_item */
8054 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 0, /* sq_ass_item */
8056 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008057 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058};
8059
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008060static PyObject*
8061unicode_subscript(PyUnicodeObject* self, PyObject* item)
8062{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008063 if (PyIndex_Check(item)) {
8064 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008065 if (i == -1 && PyErr_Occurred())
8066 return NULL;
8067 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008068 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008069 return unicode_getitem(self, i);
8070 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008071 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008072 Py_UNICODE* source_buf;
8073 Py_UNICODE* result_buf;
8074 PyObject* result;
8075
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008076 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008077 &start, &stop, &step, &slicelength) < 0) {
8078 return NULL;
8079 }
8080
8081 if (slicelength <= 0) {
8082 return PyUnicode_FromUnicode(NULL, 0);
8083 } else {
8084 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008085 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8086 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008087
8088 if (result_buf == NULL)
8089 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008090
8091 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8092 result_buf[i] = source_buf[cur];
8093 }
Tim Petersced69f82003-09-16 20:30:58 +00008094
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008095 result = PyUnicode_FromUnicode(result_buf, slicelength);
8096 PyMem_FREE(result_buf);
8097 return result;
8098 }
8099 } else {
8100 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8101 return NULL;
8102 }
8103}
8104
8105static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008106 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008107 (binaryfunc)unicode_subscript, /* mp_subscript */
8108 (objobjargproc)0, /* mp_ass_subscript */
8109};
8110
Martin v. Löwis18e16552006-02-15 17:27:45 +00008111static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008113 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 const void **ptr)
8115{
8116 if (index != 0) {
8117 PyErr_SetString(PyExc_SystemError,
8118 "accessing non-existent unicode segment");
8119 return -1;
8120 }
8121 *ptr = (void *) self->str;
8122 return PyUnicode_GET_DATA_SIZE(self);
8123}
8124
Martin v. Löwis18e16552006-02-15 17:27:45 +00008125static Py_ssize_t
8126unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 const void **ptr)
8128{
8129 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00008130 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 return -1;
8132}
8133
8134static int
8135unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008136 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137{
8138 if (lenp)
8139 *lenp = PyUnicode_GET_DATA_SIZE(self);
8140 return 1;
8141}
8142
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008143static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008145 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 const void **ptr)
8147{
8148 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 if (index != 0) {
8151 PyErr_SetString(PyExc_SystemError,
8152 "accessing non-existent unicode segment");
8153 return -1;
8154 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008155 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 if (str == NULL)
8157 return -1;
8158 *ptr = (void *) PyString_AS_STRING(str);
8159 return PyString_GET_SIZE(str);
8160}
8161
8162/* Helpers for PyUnicode_Format() */
8163
8164static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008165getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008167 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 if (argidx < arglen) {
8169 (*p_argidx)++;
8170 if (arglen < 0)
8171 return args;
8172 else
8173 return PyTuple_GetItem(args, argidx);
8174 }
8175 PyErr_SetString(PyExc_TypeError,
8176 "not enough arguments for format string");
8177 return NULL;
8178}
8179
/* Bit flags, one bit each, combinable with bitwise OR. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8185
Martin v. Löwis18e16552006-02-15 17:27:45 +00008186static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008187strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008189 register Py_ssize_t i;
8190 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 for (i = len - 1; i >= 0; i--)
8192 buffer[i] = (Py_UNICODE) charbuffer[i];
8193
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 return len;
8195}
8196
Neal Norwitzfc76d632006-01-10 06:03:13 +00008197static int
8198doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8199{
Tim Peters15231542006-02-16 01:08:01 +00008200 Py_ssize_t result;
8201
Neal Norwitzfc76d632006-01-10 06:03:13 +00008202 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008203 result = strtounicode(buffer, (char *)buffer);
8204 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008205}
8206
8207static int
8208longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8209{
Tim Peters15231542006-02-16 01:08:01 +00008210 Py_ssize_t result;
8211
Neal Norwitzfc76d632006-01-10 06:03:13 +00008212 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008213 result = strtounicode(buffer, (char *)buffer);
8214 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008215}
8216
Guido van Rossum078151d2002-08-11 04:24:12 +00008217/* XXX To save some code duplication, formatfloat/long/int could have been
8218 shared with stringobject.c, converting from 8-bit to Unicode after the
8219 formatting is done. */
8220
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221static int
8222formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008223 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 int flags,
8225 int prec,
8226 int type,
8227 PyObject *v)
8228{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008229 /* fmt = '%#.' + `prec` + `type`
8230 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 char fmt[20];
8232 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008233
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234 x = PyFloat_AsDouble(v);
8235 if (x == -1.0 && PyErr_Occurred())
8236 return -1;
8237 if (prec < 0)
8238 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8240 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008241 /* Worst case length calc to ensure no buffer overrun:
8242
8243 'g' formats:
8244 fmt = %#.<prec>g
8245 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8246 for any double rep.)
8247 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8248
8249 'f' formats:
8250 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8251 len = 1 + 50 + 1 + prec = 52 + prec
8252
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008253 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008254 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008255
8256 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008257 if (((type == 'g' || type == 'G') &&
8258 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008259 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008260 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008261 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008262 return -1;
8263 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008264 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8265 (flags&F_ALT) ? "#" : "",
8266 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008267 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268}
8269
Tim Peters38fd5b62000-09-21 05:43:11 +00008270static PyObject*
8271formatlong(PyObject *val, int flags, int prec, int type)
8272{
8273 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008274 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008275 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008276 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008277
8278 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8279 if (!str)
8280 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008281 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008282 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008283 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008284}
8285
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286static int
8287formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008288 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 int flags,
8290 int prec,
8291 int type,
8292 PyObject *v)
8293{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008294 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008295 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8296 * + 1 + 1
8297 * = 24
8298 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008299 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008300 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 long x;
8302
8303 x = PyInt_AsLong(v);
8304 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008305 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008306 if (x < 0 && type == 'u') {
8307 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008308 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008309 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8310 sign = "-";
8311 else
8312 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008314 prec = 1;
8315
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008316 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8317 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008318 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008319 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008320 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008321 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008322 return -1;
8323 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008324
8325 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008326 (type == 'x' || type == 'X' || type == 'o')) {
8327 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008328 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008329 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008330 * - when 0 is being converted, the C standard leaves off
8331 * the '0x' or '0X', which is inconsistent with other
8332 * %#x/%#X conversions and inconsistent with Python's
8333 * hex() function
8334 * - there are platforms that violate the standard and
8335 * convert 0 with the '0x' or '0X'
8336 * (Metrowerks, Compaq Tru64)
8337 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008338 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008339 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008340 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008341 * We can achieve the desired consistency by inserting our
8342 * own '0x' or '0X' prefix, and substituting %x/%X in place
8343 * of %#x/%#X.
8344 *
8345 * Note that this is the same approach as used in
8346 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008347 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008348 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8349 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008350 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008351 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008352 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8353 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008354 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008355 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008356 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008357 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008358 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008359 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360}
8361
8362static int
8363formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008364 size_t buflen,
8365 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008367 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008368 if (PyUnicode_Check(v)) {
8369 if (PyUnicode_GET_SIZE(v) != 1)
8370 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008374 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008375 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008376 goto onError;
8377 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8378 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379
8380 else {
8381 /* Integer input truncated to a character */
8382 long x;
8383 x = PyInt_AsLong(v);
8384 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008385 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008386#ifdef Py_UNICODE_WIDE
8387 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008388 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008389 "%c arg not in range(0x110000) "
8390 "(wide Python build)");
8391 return -1;
8392 }
8393#else
8394 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008395 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008396 "%c arg not in range(0x10000) "
8397 "(narrow Python build)");
8398 return -1;
8399 }
8400#endif
8401 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 }
8403 buf[1] = '\0';
8404 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008405
8406 onError:
8407 PyErr_SetString(PyExc_TypeError,
8408 "%c requires int or char");
8409 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410}
8411
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008412/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8413
8414 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8415 chars are formatted. XXX This is a magic number. Each formatting
8416 routine does bounds checking to ensure no overflow, but a better
8417 solution may be to malloc a buffer of appropriate size for each
8418 format. For now, the current solution is sufficient.
8419*/
#define FORMATBUFLEN ((size_t)120)  /* parenthesized so it is safe in any expression */
8421
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422PyObject *PyUnicode_Format(PyObject *format,
8423 PyObject *args)
8424{
8425 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008426 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 int args_owned = 0;
8428 PyUnicodeObject *result = NULL;
8429 PyObject *dict = NULL;
8430 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008431
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 if (format == NULL || args == NULL) {
8433 PyErr_BadInternalCall();
8434 return NULL;
8435 }
8436 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008437 if (uformat == NULL)
8438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 fmt = PyUnicode_AS_UNICODE(uformat);
8440 fmtcnt = PyUnicode_GET_SIZE(uformat);
8441
8442 reslen = rescnt = fmtcnt + 100;
8443 result = _PyUnicode_New(reslen);
8444 if (result == NULL)
8445 goto onError;
8446 res = PyUnicode_AS_UNICODE(result);
8447
8448 if (PyTuple_Check(args)) {
8449 arglen = PyTuple_Size(args);
8450 argidx = 0;
8451 }
8452 else {
8453 arglen = -1;
8454 argidx = -2;
8455 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008456 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008457 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 dict = args;
8459
8460 while (--fmtcnt >= 0) {
8461 if (*fmt != '%') {
8462 if (--rescnt < 0) {
8463 rescnt = fmtcnt + 100;
8464 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008465 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8468 --rescnt;
8469 }
8470 *res++ = *fmt++;
8471 }
8472 else {
8473 /* Got a format specifier */
8474 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008475 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 Py_UNICODE c = '\0';
8478 Py_UNICODE fill;
8479 PyObject *v = NULL;
8480 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008481 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008483 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008484 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485
8486 fmt++;
8487 if (*fmt == '(') {
8488 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008489 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 PyObject *key;
8491 int pcount = 1;
8492
8493 if (dict == NULL) {
8494 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008495 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496 goto onError;
8497 }
8498 ++fmt;
8499 --fmtcnt;
8500 keystart = fmt;
8501 /* Skip over balanced parentheses */
8502 while (pcount > 0 && --fmtcnt >= 0) {
8503 if (*fmt == ')')
8504 --pcount;
8505 else if (*fmt == '(')
8506 ++pcount;
8507 fmt++;
8508 }
8509 keylen = fmt - keystart - 1;
8510 if (fmtcnt < 0 || pcount > 0) {
8511 PyErr_SetString(PyExc_ValueError,
8512 "incomplete format key");
8513 goto onError;
8514 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008515#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008516 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 then looked up since Python uses strings to hold
8518 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008519 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 key = PyUnicode_EncodeUTF8(keystart,
8521 keylen,
8522 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008523#else
8524 key = PyUnicode_FromUnicode(keystart, keylen);
8525#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 if (key == NULL)
8527 goto onError;
8528 if (args_owned) {
8529 Py_DECREF(args);
8530 args_owned = 0;
8531 }
8532 args = PyObject_GetItem(dict, key);
8533 Py_DECREF(key);
8534 if (args == NULL) {
8535 goto onError;
8536 }
8537 args_owned = 1;
8538 arglen = -1;
8539 argidx = -2;
8540 }
8541 while (--fmtcnt >= 0) {
8542 switch (c = *fmt++) {
8543 case '-': flags |= F_LJUST; continue;
8544 case '+': flags |= F_SIGN; continue;
8545 case ' ': flags |= F_BLANK; continue;
8546 case '#': flags |= F_ALT; continue;
8547 case '0': flags |= F_ZERO; continue;
8548 }
8549 break;
8550 }
8551 if (c == '*') {
8552 v = getnextarg(args, arglen, &argidx);
8553 if (v == NULL)
8554 goto onError;
8555 if (!PyInt_Check(v)) {
8556 PyErr_SetString(PyExc_TypeError,
8557 "* wants int");
8558 goto onError;
8559 }
8560 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008561 if (width == -1 && PyErr_Occurred())
8562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 if (width < 0) {
8564 flags |= F_LJUST;
8565 width = -width;
8566 }
8567 if (--fmtcnt >= 0)
8568 c = *fmt++;
8569 }
8570 else if (c >= '0' && c <= '9') {
8571 width = c - '0';
8572 while (--fmtcnt >= 0) {
8573 c = *fmt++;
8574 if (c < '0' || c > '9')
8575 break;
8576 if ((width*10) / 10 != width) {
8577 PyErr_SetString(PyExc_ValueError,
8578 "width too big");
8579 goto onError;
8580 }
8581 width = width*10 + (c - '0');
8582 }
8583 }
8584 if (c == '.') {
8585 prec = 0;
8586 if (--fmtcnt >= 0)
8587 c = *fmt++;
8588 if (c == '*') {
8589 v = getnextarg(args, arglen, &argidx);
8590 if (v == NULL)
8591 goto onError;
8592 if (!PyInt_Check(v)) {
8593 PyErr_SetString(PyExc_TypeError,
8594 "* wants int");
8595 goto onError;
8596 }
8597 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008598 if (prec == -1 && PyErr_Occurred())
8599 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 if (prec < 0)
8601 prec = 0;
8602 if (--fmtcnt >= 0)
8603 c = *fmt++;
8604 }
8605 else if (c >= '0' && c <= '9') {
8606 prec = c - '0';
8607 while (--fmtcnt >= 0) {
8608 c = Py_CHARMASK(*fmt++);
8609 if (c < '0' || c > '9')
8610 break;
8611 if ((prec*10) / 10 != prec) {
8612 PyErr_SetString(PyExc_ValueError,
8613 "prec too big");
8614 goto onError;
8615 }
8616 prec = prec*10 + (c - '0');
8617 }
8618 }
8619 } /* prec */
8620 if (fmtcnt >= 0) {
8621 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 if (--fmtcnt >= 0)
8623 c = *fmt++;
8624 }
8625 }
8626 if (fmtcnt < 0) {
8627 PyErr_SetString(PyExc_ValueError,
8628 "incomplete format");
8629 goto onError;
8630 }
8631 if (c != '%') {
8632 v = getnextarg(args, arglen, &argidx);
8633 if (v == NULL)
8634 goto onError;
8635 }
8636 sign = 0;
8637 fill = ' ';
8638 switch (c) {
8639
8640 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008641 pbuf = formatbuf;
8642 /* presume that buffer length is at least 1 */
8643 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 len = 1;
8645 break;
8646
8647 case 's':
8648 case 'r':
8649 if (PyUnicode_Check(v) && c == 's') {
8650 temp = v;
8651 Py_INCREF(temp);
8652 }
8653 else {
8654 PyObject *unicode;
8655 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008656 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 else
8658 temp = PyObject_Repr(v);
8659 if (temp == NULL)
8660 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008661 if (PyUnicode_Check(temp))
8662 /* nothing to do */;
8663 else if (PyString_Check(temp)) {
8664 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008665 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008667 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008669 Py_DECREF(temp);
8670 temp = unicode;
8671 if (temp == NULL)
8672 goto onError;
8673 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008674 else {
8675 Py_DECREF(temp);
8676 PyErr_SetString(PyExc_TypeError,
8677 "%s argument has non-string str()");
8678 goto onError;
8679 }
8680 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008681 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 len = PyUnicode_GET_SIZE(temp);
8683 if (prec >= 0 && len > prec)
8684 len = prec;
8685 break;
8686
8687 case 'i':
8688 case 'd':
8689 case 'u':
8690 case 'o':
8691 case 'x':
8692 case 'X':
8693 if (c == 'i')
8694 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008695 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008696 temp = formatlong(v, flags, prec, c);
8697 if (!temp)
8698 goto onError;
8699 pbuf = PyUnicode_AS_UNICODE(temp);
8700 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008701 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008703 else {
8704 pbuf = formatbuf;
8705 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8706 flags, prec, c, v);
8707 if (len < 0)
8708 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008709 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008710 }
8711 if (flags & F_ZERO)
8712 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 break;
8714
8715 case 'e':
8716 case 'E':
8717 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008718 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719 case 'g':
8720 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008721 if (c == 'F')
8722 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008723 pbuf = formatbuf;
8724 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8725 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 if (len < 0)
8727 goto onError;
8728 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008729 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 fill = '0';
8731 break;
8732
8733 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008734 pbuf = formatbuf;
8735 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736 if (len < 0)
8737 goto onError;
8738 break;
8739
8740 default:
8741 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008742 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008743 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008744 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008745 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008746 (Py_ssize_t)(fmt - 1 -
8747 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 goto onError;
8749 }
8750 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008751 if (*pbuf == '-' || *pbuf == '+') {
8752 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 len--;
8754 }
8755 else if (flags & F_SIGN)
8756 sign = '+';
8757 else if (flags & F_BLANK)
8758 sign = ' ';
8759 else
8760 sign = 0;
8761 }
8762 if (width < len)
8763 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008764 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 reslen -= rescnt;
8766 rescnt = width + fmtcnt + 100;
8767 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008768 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008769 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008770 PyErr_NoMemory();
8771 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008772 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008773 if (_PyUnicode_Resize(&result, reslen) < 0) {
8774 Py_XDECREF(temp);
8775 goto onError;
8776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 res = PyUnicode_AS_UNICODE(result)
8778 + reslen - rescnt;
8779 }
8780 if (sign) {
8781 if (fill != ' ')
8782 *res++ = sign;
8783 rescnt--;
8784 if (width > len)
8785 width--;
8786 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008787 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008788 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008789 assert(pbuf[1] == c);
8790 if (fill != ' ') {
8791 *res++ = *pbuf++;
8792 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008793 }
Tim Petersfff53252001-04-12 18:38:48 +00008794 rescnt -= 2;
8795 width -= 2;
8796 if (width < 0)
8797 width = 0;
8798 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 if (width > len && !(flags & F_LJUST)) {
8801 do {
8802 --rescnt;
8803 *res++ = fill;
8804 } while (--width > len);
8805 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008806 if (fill == ' ') {
8807 if (sign)
8808 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008809 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008810 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008811 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008812 *res++ = *pbuf++;
8813 *res++ = *pbuf++;
8814 }
8815 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008816 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817 res += len;
8818 rescnt -= len;
8819 while (--width >= len) {
8820 --rescnt;
8821 *res++ = ' ';
8822 }
8823 if (dict && (argidx < arglen) && c != '%') {
8824 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008825 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008826 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 goto onError;
8828 }
8829 Py_XDECREF(temp);
8830 } /* '%' */
8831 } /* until end */
8832 if (argidx < arglen && !dict) {
8833 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008834 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 goto onError;
8836 }
8837
Thomas Woutersa96affe2006-03-12 00:29:36 +00008838 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8839 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 if (args_owned) {
8841 Py_DECREF(args);
8842 }
8843 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 return (PyObject *)result;
8845
8846 onError:
8847 Py_XDECREF(result);
8848 Py_DECREF(uformat);
8849 if (args_owned) {
8850 Py_DECREF(args);
8851 }
8852 return NULL;
8853}
8854
8855static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008856 (readbufferproc) unicode_buffer_getreadbuf,
8857 (writebufferproc) unicode_buffer_getwritebuf,
8858 (segcountproc) unicode_buffer_getsegcount,
8859 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860};
8861
Jeremy Hylton938ace62002-07-17 16:30:39 +00008862static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008863unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8864
Tim Peters6d6c1a32001-08-02 04:15:00 +00008865static PyObject *
8866unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8867{
8868 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008869 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008870 char *encoding = NULL;
8871 char *errors = NULL;
8872
Guido van Rossume023fe02001-08-30 03:12:59 +00008873 if (type != &PyUnicode_Type)
8874 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008875 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8876 kwlist, &x, &encoding, &errors))
8877 return NULL;
8878 if (x == NULL)
8879 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008880 if (encoding == NULL && errors == NULL)
8881 return PyObject_Unicode(x);
8882 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008883 return PyUnicode_FromEncodedObject(x, encoding, errors);
8884}
8885
Guido van Rossume023fe02001-08-30 03:12:59 +00008886static PyObject *
8887unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8888{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008889 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008890 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008891
8892 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8893 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8894 if (tmp == NULL)
8895 return NULL;
8896 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008897 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008898 if (pnew == NULL) {
8899 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008900 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008901 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008902 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8903 if (pnew->str == NULL) {
8904 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008905 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008906 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008907 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008908 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008909 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8910 pnew->length = n;
8911 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008912 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008913 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008914}
8915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008916PyDoc_STRVAR(unicode_doc,
Collin Winterd474ce82007-08-07 19:42:11 +00008917"str(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008918\n\
Collin Winterd474ce82007-08-07 19:42:11 +00008919Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008920encoding defaults to the current default string encoding.\n\
8921errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008922
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008923static PyObject *unicode_iter(PyObject *seq);
8924
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008926 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008927 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928 sizeof(PyUnicodeObject), /* tp_size */
8929 0, /* tp_itemsize */
8930 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008931 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008933 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008935 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008936 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008937 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008939 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 (hashfunc) unicode_hash, /* tp_hash*/
8941 0, /* tp_call*/
8942 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008943 PyObject_GenericGetAttr, /* tp_getattro */
8944 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008946 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8947 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008948 unicode_doc, /* tp_doc */
8949 0, /* tp_traverse */
8950 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008951 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008952 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008953 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008954 0, /* tp_iternext */
8955 unicode_methods, /* tp_methods */
8956 0, /* tp_members */
8957 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008958 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008959 0, /* tp_dict */
8960 0, /* tp_descr_get */
8961 0, /* tp_descr_set */
8962 0, /* tp_dictoffset */
8963 0, /* tp_init */
8964 0, /* tp_alloc */
8965 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008966 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967};
8968
8969/* Initialize the Unicode implementation */
8970
Thomas Wouters78890102000-07-22 19:25:51 +00008971void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008973 int i;
8974
Thomas Wouters477c8d52006-05-27 19:21:47 +00008975 /* XXX - move this array to unicodectype.c ? */
8976 Py_UNICODE linebreak[] = {
8977 0x000A, /* LINE FEED */
8978 0x000D, /* CARRIAGE RETURN */
8979 0x001C, /* FILE SEPARATOR */
8980 0x001D, /* GROUP SEPARATOR */
8981 0x001E, /* RECORD SEPARATOR */
8982 0x0085, /* NEXT LINE */
8983 0x2028, /* LINE SEPARATOR */
8984 0x2029, /* PARAGRAPH SEPARATOR */
8985 };
8986
Fred Drakee4315f52000-05-09 19:53:39 +00008987 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008988 unicode_freelist = NULL;
8989 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008991 if (!unicode_empty)
8992 return;
8993
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008994 for (i = 0; i < 256; i++)
8995 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008996 if (PyType_Ready(&PyUnicode_Type) < 0)
8997 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008998
8999 /* initialize the linebreak bloom filter */
9000 bloom_linebreak = make_bloom_mask(
9001 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9002 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009003
9004 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005}
9006
9007/* Finalize the Unicode implementation */
9008
9009void
Thomas Wouters78890102000-07-22 19:25:51 +00009010_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009012 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009013 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009015 Py_XDECREF(unicode_empty);
9016 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009017
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009018 for (i = 0; i < 256; i++) {
9019 if (unicode_latin1[i]) {
9020 Py_DECREF(unicode_latin1[i]);
9021 unicode_latin1[i] = NULL;
9022 }
9023 }
9024
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009025 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 PyUnicodeObject *v = u;
9027 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00009028 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00009029 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00009030 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009031 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009033 unicode_freelist = NULL;
9034 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009036
Walter Dörwald16807132007-05-25 13:52:07 +00009037void
9038PyUnicode_InternInPlace(PyObject **p)
9039{
9040 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9041 PyObject *t;
9042 if (s == NULL || !PyUnicode_Check(s))
9043 Py_FatalError(
9044 "PyUnicode_InternInPlace: unicode strings only please!");
9045 /* If it's a subclass, we don't really know what putting
9046 it in the interned dict might do. */
9047 if (!PyUnicode_CheckExact(s))
9048 return;
9049 if (PyUnicode_CHECK_INTERNED(s))
9050 return;
9051 if (interned == NULL) {
9052 interned = PyDict_New();
9053 if (interned == NULL) {
9054 PyErr_Clear(); /* Don't leave an exception */
9055 return;
9056 }
9057 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009058 /* It might be that the GetItem call fails even
9059 though the key is present in the dictionary,
9060 namely when this happens during a stack overflow. */
9061 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009062 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009063 Py_END_ALLOW_RECURSION
9064
Walter Dörwald16807132007-05-25 13:52:07 +00009065 if (t) {
9066 Py_INCREF(t);
9067 Py_DECREF(*p);
9068 *p = t;
9069 return;
9070 }
9071
Martin v. Löwis5b222132007-06-10 09:51:05 +00009072 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009073 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9074 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009075 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009076 return;
9077 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009078 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009079 /* The two references in interned are not counted by refcnt.
9080 The deallocator will take care of this */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009081 Py_Refcnt(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009082 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9083}
9084
9085void
9086PyUnicode_InternImmortal(PyObject **p)
9087{
9088 PyUnicode_InternInPlace(p);
9089 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9090 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9091 Py_INCREF(*p);
9092 }
9093}
9094
9095PyObject *
9096PyUnicode_InternFromString(const char *cp)
9097{
9098 PyObject *s = PyUnicode_FromString(cp);
9099 if (s == NULL)
9100 return NULL;
9101 PyUnicode_InternInPlace(&s);
9102 return s;
9103}
9104
9105void _Py_ReleaseInternedUnicodeStrings(void)
9106{
9107 PyObject *keys;
9108 PyUnicodeObject *s;
9109 Py_ssize_t i, n;
9110 Py_ssize_t immortal_size = 0, mortal_size = 0;
9111
9112 if (interned == NULL || !PyDict_Check(interned))
9113 return;
9114 keys = PyDict_Keys(interned);
9115 if (keys == NULL || !PyList_Check(keys)) {
9116 PyErr_Clear();
9117 return;
9118 }
9119
9120 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9121 detector, interned unicode strings are not forcibly deallocated;
9122 rather, we give them their stolen references back, and then clear
9123 and DECREF the interned dict. */
9124
9125 n = PyList_GET_SIZE(keys);
9126 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9127 n);
9128 for (i = 0; i < n; i++) {
9129 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9130 switch (s->state) {
9131 case SSTATE_NOT_INTERNED:
9132 /* XXX Shouldn't happen */
9133 break;
9134 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009135 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009136 immortal_size += s->length;
9137 break;
9138 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00009139 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009140 mortal_size += s->length;
9141 break;
9142 default:
9143 Py_FatalError("Inconsistent interned string state.");
9144 }
9145 s->state = SSTATE_NOT_INTERNED;
9146 }
9147 fprintf(stderr, "total size of all interned strings: "
9148 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9149 "mortal/immortal\n", mortal_size, immortal_size);
9150 Py_DECREF(keys);
9151 PyDict_Clear(interned);
9152 Py_DECREF(interned);
9153 interned = NULL;
9154}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009155
9156
9157/********************* Unicode Iterator **************************/
9158
9159typedef struct {
9160 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00009161 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009162 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9163} unicodeiterobject;
9164
9165static void
9166unicodeiter_dealloc(unicodeiterobject *it)
9167{
9168 _PyObject_GC_UNTRACK(it);
9169 Py_XDECREF(it->it_seq);
9170 PyObject_GC_Del(it);
9171}
9172
9173static int
9174unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9175{
9176 Py_VISIT(it->it_seq);
9177 return 0;
9178}
9179
9180static PyObject *
9181unicodeiter_next(unicodeiterobject *it)
9182{
9183 PyUnicodeObject *seq;
9184 PyObject *item;
9185
9186 assert(it != NULL);
9187 seq = it->it_seq;
9188 if (seq == NULL)
9189 return NULL;
9190 assert(PyUnicode_Check(seq));
9191
9192 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009193 item = PyUnicode_FromUnicode(
9194 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009195 if (item != NULL)
9196 ++it->it_index;
9197 return item;
9198 }
9199
9200 Py_DECREF(seq);
9201 it->it_seq = NULL;
9202 return NULL;
9203}
9204
9205static PyObject *
9206unicodeiter_len(unicodeiterobject *it)
9207{
9208 Py_ssize_t len = 0;
9209 if (it->it_seq)
9210 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9211 return PyInt_FromSsize_t(len);
9212}
9213
9214PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9215
9216static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009217 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9218 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009219 {NULL, NULL} /* sentinel */
9220};
9221
9222PyTypeObject PyUnicodeIter_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009223 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009224 "unicodeiterator", /* tp_name */
9225 sizeof(unicodeiterobject), /* tp_basicsize */
9226 0, /* tp_itemsize */
9227 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00009228 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009229 0, /* tp_print */
9230 0, /* tp_getattr */
9231 0, /* tp_setattr */
9232 0, /* tp_compare */
9233 0, /* tp_repr */
9234 0, /* tp_as_number */
9235 0, /* tp_as_sequence */
9236 0, /* tp_as_mapping */
9237 0, /* tp_hash */
9238 0, /* tp_call */
9239 0, /* tp_str */
9240 PyObject_GenericGetAttr, /* tp_getattro */
9241 0, /* tp_setattro */
9242 0, /* tp_as_buffer */
9243 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9244 0, /* tp_doc */
9245 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9246 0, /* tp_clear */
9247 0, /* tp_richcompare */
9248 0, /* tp_weaklistoffset */
9249 PyObject_SelfIter, /* tp_iter */
9250 (iternextfunc)unicodeiter_next, /* tp_iternext */
9251 unicodeiter_methods, /* tp_methods */
9252 0,
9253};
9254
9255static PyObject *
9256unicode_iter(PyObject *seq)
9257{
9258 unicodeiterobject *it;
9259
9260 if (!PyUnicode_Check(seq)) {
9261 PyErr_BadInternalCall();
9262 return NULL;
9263 }
9264 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9265 if (it == NULL)
9266 return NULL;
9267 it->it_index = 0;
9268 Py_INCREF(seq);
9269 it->it_seq = (PyUnicodeObject *)seq;
9270 _PyObject_GC_TRACK(it);
9271 return (PyObject *)it;
9272}
9273
Martin v. Löwis5b222132007-06-10 09:51:05 +00009274size_t
9275Py_UNICODE_strlen(const Py_UNICODE *u)
9276{
9277 int res = 0;
9278 while(*u++)
9279 res++;
9280 return res;
9281}
9282
9283Py_UNICODE*
9284Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9285{
9286 Py_UNICODE *u = s1;
9287 while ((*u++ = *s2++));
9288 return s1;
9289}
9290
9291Py_UNICODE*
9292Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9293{
9294 Py_UNICODE *u = s1;
9295 while ((*u++ = *s2++))
9296 if (n-- == 0)
9297 break;
9298 return s1;
9299}
9300
9301int
9302Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9303{
9304 while (*s1 && *s2 && *s1 == *s2)
9305 s1++, s2++;
9306 if (*s1 && *s2)
9307 return (*s1 < *s2) ? -1 : +1;
9308 if (*s1)
9309 return 1;
9310 if (*s2)
9311 return -1;
9312 return 0;
9313}
9314
9315Py_UNICODE*
9316Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9317{
9318 const Py_UNICODE *p;
9319 for (p = s; *p; p++)
9320 if (*p == c)
9321 return (Py_UNICODE*)p;
9322 return NULL;
9323}
9324
9325
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009326#ifdef __cplusplus
9327}
9328#endif
9329
9330
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009331/*
9332Local variables:
9333c-basic-offset: 4
9334indent-tabs-mode: nil
9335End:
9336*/