blob: aed07ee2b8ec8db8c5f47e2cbca9e1e63d3a9522 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000107 parameter; it is fixed to "utf-8". Always use the
108 PyUnicode_GetDefaultEncoding() API to access this global. */
109static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000110
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000111Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000112PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000113{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000114#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115 return 0x10FFFF;
116#else
117 /* This is actually an illegal character, so it should
118 not be passed to unichr. */
119 return 0xFFFF;
120#endif
121}
122
Thomas Wouters477c8d52006-05-27 19:21:47 +0000123/* --- Bloom Filters ----------------------------------------------------- */
124
125/* stuff to implement simple "bloom filters" for Unicode characters.
126 to keep things simple, we use a single bitmask, using the least 5
127 bits from each unicode characters as the bit index. */
128
129/* the linebreak mask is set up by Unicode_Init below */
130
/* Type of a bloom mask: one bit for each possible value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters; filled in elsewhere in this
   file before BLOOM_LINEBREAK() is used. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long so that selecting bit 31 is
   well defined (1 << 31 overflows a 32-bit int) and so that no sign
   extension leaks into the upper bits of a 64-bit mask.  Both arguments
   are parenthesized so arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Nonzero if ch is a line break character.  The cheap bloom test rejects
   most characters before the exact Py_UNICODE_ISLINEBREAK() check runs. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
139
140Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
141{
142 /* calculate simple bloom-style bitmask for a given unicode string */
143
144 long mask;
145 Py_ssize_t i;
146
147 mask = 0;
148 for (i = 0; i < len; i++)
149 mask |= (1 << (ptr[i] & 0x1F));
150
151 return mask;
152}
153
154Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
155{
156 Py_ssize_t i;
157
158 for (i = 0; i < setlen; i++)
159 if (set[i] == chr)
160 return 1;
161
162 return 0;
163}
164
/* Nonzero if chr is a member of the setlen characters at set.  The bloom
   mask is tested first so that most non-members skip the linear search.
   The whole expansion is parenthesized so that the macro behaves as a
   single operand (e.g. under '!' or next to '||'). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168/* --- Unicode Object ----------------------------------------------------- */
169
170static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000172 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173{
174 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000175
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000176 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180 /* Resizing shared object (unicode_empty or single character
181 objects) in-place is not allowed. Use PyUnicode_Resize()
182 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000183
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000184 if (unicode == unicode_empty ||
185 (unicode->length == 1 &&
186 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000189 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 return -1;
191 }
192
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 /* We allocate one more byte to make sure the string is Ux0000 terminated.
194 The overallocation is also used by fastsearch, which assumes that it's
195 safe to look at str[length] (without making any assumptions about what
196 it contains). */
197
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 oldstr = unicode->str;
199 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
200 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000201 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 PyErr_NoMemory();
203 return -1;
204 }
205 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000208 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000210 if (unicode->defenc) {
211 Py_DECREF(unicode->defenc);
212 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 }
214 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000215
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 return 0;
217}
218
219/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000220 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221
222 XXX This allocator could further be enhanced by assuring that the
223 free list never reduces its size below 1.
224
225*/
226
227static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000228PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229{
230 register PyUnicodeObject *unicode;
231
Thomas Wouters477c8d52006-05-27 19:21:47 +0000232 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (length == 0 && unicode_empty != NULL) {
234 Py_INCREF(unicode_empty);
235 return unicode_empty;
236 }
237
238 /* Unicode freelist & memory allocation */
239 if (unicode_freelist) {
240 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000241 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization: we only upsize the buffer,
245 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000246 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000247 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000249 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000252 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000254 }
255 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 }
257 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000258 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 if (unicode == NULL)
260 return NULL;
261 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
262 }
263
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000264 if (!unicode->str) {
265 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000266 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000267 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000269 * the caller fails before initializing str -- unicode_resize()
270 * reads str[0], and the Keep-Alive optimization can keep memory
271 * allocated for str alive across a call to unicode_dealloc(unicode).
272 * We don't want unicode_resize to read uninitialized memory in
273 * that case.
274 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000275 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000279 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000281
282 onError:
283 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000284 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286}
287
288static
Guido van Rossum9475a232001-10-05 20:51:39 +0000289void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000291 if (PyUnicode_CheckExact(unicode) &&
292 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000293 /* Keep-Alive optimization */
294 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000295 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 unicode->str = NULL;
297 unicode->length = 0;
298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
300 Py_DECREF(unicode->defenc);
301 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 }
303 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 *(PyUnicodeObject **)unicode = unicode_freelist;
305 unicode_freelist = unicode;
306 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 }
308 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000309 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000311 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313}
314
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316{
317 register PyUnicodeObject *v;
318
319 /* Argument checks */
320 if (unicode == NULL) {
321 PyErr_BadInternalCall();
322 return -1;
323 }
324 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000325 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 PyErr_BadInternalCall();
327 return -1;
328 }
329
330 /* Resizing unicode_empty and single character objects is not
331 possible since these are being shared. We simply return a fresh
332 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000333 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 (v == unicode_empty || v->length == 1)) {
335 PyUnicodeObject *w = _PyUnicode_New(length);
336 if (w == NULL)
337 return -1;
338 Py_UNICODE_COPY(w->str, v->str,
339 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000340 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 *unicode = (PyObject *)w;
342 return 0;
343 }
344
345 /* Note that we don't have to modify *unicode for unshared Unicode
346 objects, since we can modify them in-place. */
347 return unicode_resize(v, length);
348}
349
350/* Internal API for use in unicodeobject.c only ! */
351#define _PyUnicode_Resize(unicodevar, length) \
352 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
353
Guido van Rossumd57fd912000-03-10 22:53:23 +0000354PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000355 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356{
357 PyUnicodeObject *unicode;
358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000359 /* If the Unicode data is known at construction time, we can apply
360 some optimizations which share commonly used objects. */
361 if (u != NULL) {
362
363 /* Optimization for empty strings */
364 if (size == 0 && unicode_empty != NULL) {
365 Py_INCREF(unicode_empty);
366 return (PyObject *)unicode_empty;
367 }
368
369 /* Single character Unicode objects in the Latin-1 range are
370 shared when using this constructor */
371 if (size == 1 && *u < 256) {
372 unicode = unicode_latin1[*u];
373 if (!unicode) {
374 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000375 if (!unicode)
376 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000377 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000378 unicode_latin1[*u] = unicode;
379 }
380 Py_INCREF(unicode);
381 return (PyObject *)unicode;
382 }
383 }
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 unicode = _PyUnicode_New(size);
386 if (!unicode)
387 return NULL;
388
389 /* Copy the Unicode data into the new object */
390 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392
393 return (PyObject *)unicode;
394}
395
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000396PyObject *PyUnicode_FromString(const char *u)
397{
398 PyUnicodeObject *unicode;
399 Py_ssize_t size = strlen(u);
400
401 /* If the Unicode data is known at construction time, we can apply
402 some optimizations which share commonly used objects. */
403 if (u != NULL) {
404
405 /* Optimization for empty strings */
406 if (size == 0 && unicode_empty != NULL) {
407 Py_INCREF(unicode_empty);
408 return (PyObject *)unicode_empty;
409 }
410
Walter Dörwald071b9da2007-05-05 14:21:20 +0000411 /* Single characters are shared when using this constructor */
412 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000413 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000414 if (!unicode) {
415 unicode = _PyUnicode_New(1);
416 if (!unicode)
417 return NULL;
418 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000419 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000420 }
421 Py_INCREF(unicode);
422 return (PyObject *)unicode;
423 }
424 }
425
426 unicode = _PyUnicode_New(size);
427 if (!unicode)
428 return NULL;
429
430 /* Copy the Unicode data into the new object */
431 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000432 Py_UNICODE *p = unicode->str;
433 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000434 ;
435 }
436
437 return (PyObject *)unicode;
438}
439
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440#ifdef HAVE_WCHAR_H
441
442PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000443 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444{
445 PyUnicodeObject *unicode;
446
447 if (w == NULL) {
448 PyErr_BadInternalCall();
449 return NULL;
450 }
451
452 unicode = _PyUnicode_New(size);
453 if (!unicode)
454 return NULL;
455
456 /* Copy the wchar_t data into the new object */
457#ifdef HAVE_USABLE_WCHAR_T
458 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000459#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 {
461 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000462 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000464 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465 *u++ = *w++;
466 }
467#endif
468
469 return (PyObject *)unicode;
470}
471
Martin v. Löwis18e16552006-02-15 17:27:45 +0000472Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
473 wchar_t *w,
474 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475{
476 if (unicode == NULL) {
477 PyErr_BadInternalCall();
478 return -1;
479 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000480
481 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000483 size = PyUnicode_GET_SIZE(unicode) + 1;
484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485#ifdef HAVE_USABLE_WCHAR_T
486 memcpy(w, unicode->str, size * sizeof(wchar_t));
487#else
488 {
489 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000490 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000492 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 *w++ = *u++;
494 }
495#endif
496
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000497 if (size > PyUnicode_GET_SIZE(unicode))
498 return PyUnicode_GET_SIZE(unicode);
499 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 return size;
501}
502
503#endif
504
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000505PyObject *PyUnicode_FromOrdinal(int ordinal)
506{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000507 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000508
509#ifdef Py_UNICODE_WIDE
510 if (ordinal < 0 || ordinal > 0x10ffff) {
511 PyErr_SetString(PyExc_ValueError,
512 "unichr() arg not in range(0x110000) "
513 "(wide Python build)");
514 return NULL;
515 }
516#else
517 if (ordinal < 0 || ordinal > 0xffff) {
518 PyErr_SetString(PyExc_ValueError,
519 "unichr() arg not in range(0x10000) "
520 "(narrow Python build)");
521 return NULL;
522 }
523#endif
524
Hye-Shik Chang40574832004-04-06 07:24:51 +0000525 s[0] = (Py_UNICODE)ordinal;
526 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000527}
528
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529PyObject *PyUnicode_FromObject(register PyObject *obj)
530{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000531 /* XXX Perhaps we should make this API an alias of
532 PyObject_Unicode() instead ?! */
533 if (PyUnicode_CheckExact(obj)) {
534 Py_INCREF(obj);
535 return obj;
536 }
537 if (PyUnicode_Check(obj)) {
538 /* For a Unicode subtype that's not a Unicode object,
539 return a true Unicode object with the same data. */
540 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
541 PyUnicode_GET_SIZE(obj));
542 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000543 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
544}
545
546PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
547 const char *encoding,
548 const char *errors)
549{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000550 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000551 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000552 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000553
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 if (obj == NULL) {
555 PyErr_BadInternalCall();
556 return NULL;
557 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000558
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000559#if 0
560 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000561 that no encodings is given and then redirect to
562 PyObject_Unicode() which then applies the additional logic for
563 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000564
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000565 NOTE: This API should really only be used for object which
566 represent *encoded* Unicode !
567
568 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000569 if (PyUnicode_Check(obj)) {
570 if (encoding) {
571 PyErr_SetString(PyExc_TypeError,
572 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000573 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000574 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000575 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000576 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000577#else
578 if (PyUnicode_Check(obj)) {
579 PyErr_SetString(PyExc_TypeError,
580 "decoding Unicode is not supported");
581 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000582 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000583#endif
584
585 /* Coerce object */
586 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000587 s = PyString_AS_STRING(obj);
588 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000589 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000590 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
591 /* Overwrite the error message with something more useful in
592 case of a TypeError. */
593 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000594 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000595 "coercing to Unicode: need string or buffer, "
596 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000597 obj->ob_type->tp_name);
598 goto onError;
599 }
Tim Petersced69f82003-09-16 20:30:58 +0000600
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000601 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000602 if (len == 0) {
603 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000604 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 }
Tim Petersced69f82003-09-16 20:30:58 +0000606 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000607 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000608
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000609 return v;
610
611 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613}
614
615PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000616 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 const char *encoding,
618 const char *errors)
619{
620 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000621
622 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000623 encoding = PyUnicode_GetDefaultEncoding();
624
625 /* Shortcuts for common default encodings */
626 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000627 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000628 else if (strcmp(encoding, "latin-1") == 0)
629 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000630#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
631 else if (strcmp(encoding, "mbcs") == 0)
632 return PyUnicode_DecodeMBCS(s, size, errors);
633#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000634 else if (strcmp(encoding, "ascii") == 0)
635 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636
637 /* Decode via the codec registry */
638 buffer = PyBuffer_FromMemory((void *)s, size);
639 if (buffer == NULL)
640 goto onError;
641 unicode = PyCodec_Decode(buffer, encoding, errors);
642 if (unicode == NULL)
643 goto onError;
644 if (!PyUnicode_Check(unicode)) {
645 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000646 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 unicode->ob_type->tp_name);
648 Py_DECREF(unicode);
649 goto onError;
650 }
651 Py_DECREF(buffer);
652 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000653
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 onError:
655 Py_XDECREF(buffer);
656 return NULL;
657}
658
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000659PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
660 const char *encoding,
661 const char *errors)
662{
663 PyObject *v;
664
665 if (!PyUnicode_Check(unicode)) {
666 PyErr_BadArgument();
667 goto onError;
668 }
669
670 if (encoding == NULL)
671 encoding = PyUnicode_GetDefaultEncoding();
672
673 /* Decode via the codec registry */
674 v = PyCodec_Decode(unicode, encoding, errors);
675 if (v == NULL)
676 goto onError;
677 return v;
678
679 onError:
680 return NULL;
681}
682
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000684 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000685 const char *encoding,
686 const char *errors)
687{
688 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000689
Guido van Rossumd57fd912000-03-10 22:53:23 +0000690 unicode = PyUnicode_FromUnicode(s, size);
691 if (unicode == NULL)
692 return NULL;
693 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
694 Py_DECREF(unicode);
695 return v;
696}
697
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000698PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
699 const char *encoding,
700 const char *errors)
701{
702 PyObject *v;
703
704 if (!PyUnicode_Check(unicode)) {
705 PyErr_BadArgument();
706 goto onError;
707 }
708
709 if (encoding == NULL)
710 encoding = PyUnicode_GetDefaultEncoding();
711
712 /* Encode via the codec registry */
713 v = PyCodec_Encode(unicode, encoding, errors);
714 if (v == NULL)
715 goto onError;
716 return v;
717
718 onError:
719 return NULL;
720}
721
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
723 const char *encoding,
724 const char *errors)
725{
726 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 if (!PyUnicode_Check(unicode)) {
729 PyErr_BadArgument();
730 goto onError;
731 }
Fred Drakee4315f52000-05-09 19:53:39 +0000732
Tim Petersced69f82003-09-16 20:30:58 +0000733 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000734 encoding = PyUnicode_GetDefaultEncoding();
735
736 /* Shortcuts for common default encodings */
737 if (errors == NULL) {
738 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000739 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000740 else if (strcmp(encoding, "latin-1") == 0)
741 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000742#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
743 else if (strcmp(encoding, "mbcs") == 0)
744 return PyUnicode_AsMBCSString(unicode);
745#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000746 else if (strcmp(encoding, "ascii") == 0)
747 return PyUnicode_AsASCIIString(unicode);
748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000749
750 /* Encode via the codec registry */
751 v = PyCodec_Encode(unicode, encoding, errors);
752 if (v == NULL)
753 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000754 if (!PyBytes_Check(v)) {
755 if (PyString_Check(v)) {
756 /* Old codec, turn it into bytes */
757 PyObject *b = PyBytes_FromObject(v);
758 Py_DECREF(v);
759 return b;
760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000762 "encoder did not return a bytes object "
763 "(type=%.400s, encoding=%.20s, errors=%.20s)",
764 v->ob_type->tp_name,
765 encoding ? encoding : "NULL",
766 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767 Py_DECREF(v);
768 goto onError;
769 }
770 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000771
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772 onError:
773 return NULL;
774}
775
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000776PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
777 const char *errors)
778{
779 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000780 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000781 if (v)
782 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000783 if (errors != NULL)
784 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
785 if (errors == NULL) {
786 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
787 PyUnicode_GET_SIZE(unicode),
788 NULL);
789 }
790 else {
791 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
792 }
793 if (!b)
794 return NULL;
795 v = PyString_FromStringAndSize(PyBytes_AsString(b),
796 PyBytes_Size(b));
797 Py_DECREF(b);
798 if (!errors) {
799 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000800 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000801 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000802 return v;
803}
804
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
806{
807 if (!PyUnicode_Check(unicode)) {
808 PyErr_BadArgument();
809 goto onError;
810 }
811 return PyUnicode_AS_UNICODE(unicode);
812
813 onError:
814 return NULL;
815}
816
Martin v. Löwis18e16552006-02-15 17:27:45 +0000817Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818{
819 if (!PyUnicode_Check(unicode)) {
820 PyErr_BadArgument();
821 goto onError;
822 }
823 return PyUnicode_GET_SIZE(unicode);
824
825 onError:
826 return -1;
827}
828
Thomas Wouters78890102000-07-22 19:25:51 +0000829const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000830{
831 return unicode_default_encoding;
832}
833
834int PyUnicode_SetDefaultEncoding(const char *encoding)
835{
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000836 if (strcmp(encoding, unicode_default_encoding) != 0) {
837 PyErr_Format(PyExc_ValueError,
838 "Can only set default encoding to %s",
839 unicode_default_encoding);
840 return -1;
841 }
Fred Drakee4315f52000-05-09 19:53:39 +0000842 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +0000843}
844
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000845/* error handling callback helper:
846 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000847 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848 and adjust various state variables.
849 return 0 on success, -1 on error
850*/
851
852static
853int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
854 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000855 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
856 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000858 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000859
860 PyObject *restuple = NULL;
861 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000862 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
863 Py_ssize_t requiredsize;
864 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000865 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000866 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000867 int res = -1;
868
869 if (*errorHandler == NULL) {
870 *errorHandler = PyCodec_LookupError(errors);
871 if (*errorHandler == NULL)
872 goto onError;
873 }
874
875 if (*exceptionObject == NULL) {
876 *exceptionObject = PyUnicodeDecodeError_Create(
877 encoding, input, insize, *startinpos, *endinpos, reason);
878 if (*exceptionObject == NULL)
879 goto onError;
880 }
881 else {
882 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
883 goto onError;
884 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
885 goto onError;
886 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
887 goto onError;
888 }
889
890 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
891 if (restuple == NULL)
892 goto onError;
893 if (!PyTuple_Check(restuple)) {
894 PyErr_Format(PyExc_TypeError, &argparse[4]);
895 goto onError;
896 }
897 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
898 goto onError;
899 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000900 newpos = insize+newpos;
901 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000902 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000903 goto onError;
904 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000905
906 /* need more space? (at least enough for what we
907 have+the replacement+the rest of the string (starting
908 at the new input position), so we won't have to check space
909 when there are no errors in the rest of the string) */
910 repptr = PyUnicode_AS_UNICODE(repunicode);
911 repsize = PyUnicode_GET_SIZE(repunicode);
912 requiredsize = *outpos + repsize + insize-newpos;
913 if (requiredsize > outsize) {
914 if (requiredsize<2*outsize)
915 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000916 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000917 goto onError;
918 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
919 }
920 *endinpos = newpos;
921 *inptr = input + newpos;
922 Py_UNICODE_COPY(*outptr, repptr, repsize);
923 *outptr += repsize;
924 *outpos += repsize;
925 /* we made it! */
926 res = 0;
927
928 onError:
929 Py_XDECREF(restuple);
930 return res;
931}
932
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000933/* --- UTF-7 Codec -------------------------------------------------------- */
934
935/* see RFC2152 for details */
936
/* Classification of the 128 ASCII code points for the UTF-7 codec:
   says whether a character is special, i.e. cannot be directly encoded.

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   One row per 16 code points; the row's first code point is noted. */
static char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
955
/* Helper macros for the UTF-7 codec.  All of them evaluate their
   arguments more than once, so pass plain variables only. */

/* SPECIAL(c, encodeO, encodeWS): true if c cannot be written directly,
   i.e. it is outside ASCII, is marked 1 in utf7_special, or is
   whitespace / Set O while the matching flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a character of the base64 alphabet.
   Written as explicit ASCII range tests instead of isalnum(): c may be
   a Py_UNICODE value that is not representable as unsigned char, for
   which the <ctype.h> functions are undefined, and isalnum() is also
   locale dependent. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character c (inverse of B64).
   Only meaningful when B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit the top six as a base64 character through out and decrease
   bits by six. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the top sixteen as one code unit.  A unit in
   0xDC00..0xDFFF sets surrogate and jumps to the caller's utf7Error
   label (after setting the caller's errmsg); the unit following such
   an error is dropped; anything else is stored through out. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000998
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000999PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001000 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 const char *errors)
1002{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001003 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001004 Py_ssize_t startinpos;
1005 Py_ssize_t endinpos;
1006 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 const char *e;
1008 PyUnicodeObject *unicode;
1009 Py_UNICODE *p;
1010 const char *errmsg = "";
1011 int inShift = 0;
1012 unsigned int bitsleft = 0;
1013 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001014 int surrogate = 0;
1015 PyObject *errorHandler = NULL;
1016 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001017
1018 unicode = _PyUnicode_New(size);
1019 if (!unicode)
1020 return NULL;
1021 if (size == 0)
1022 return (PyObject *)unicode;
1023
1024 p = unicode->str;
1025 e = s + size;
1026
1027 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001028 Py_UNICODE ch;
1029 restart:
1030 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001031
1032 if (inShift) {
1033 if ((ch == '-') || !B64CHAR(ch)) {
1034 inShift = 0;
1035 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001036
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001037 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1038 if (bitsleft >= 6) {
1039 /* The shift sequence has a partial character in it. If
1040 bitsleft < 6 then we could just classify it as padding
1041 but that is not the case here */
1042
1043 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001044 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001045 }
1046 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001047 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 here so indicate the potential of a misencoded character. */
1049
1050 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1051 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1052 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001053 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001054 }
1055
1056 if (ch == '-') {
1057 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001058 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001059 inShift = 1;
1060 }
1061 } else if (SPECIAL(ch,0,0)) {
1062 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001063 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 } else {
1065 *p++ = ch;
1066 }
1067 } else {
1068 charsleft = (charsleft << 6) | UB64(ch);
1069 bitsleft += 6;
1070 s++;
1071 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1072 }
1073 }
1074 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001075 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001076 s++;
1077 if (s < e && *s == '-') {
1078 s++;
1079 *p++ = '+';
1080 } else
1081 {
1082 inShift = 1;
1083 bitsleft = 0;
1084 }
1085 }
1086 else if (SPECIAL(ch,0,0)) {
1087 errmsg = "unexpected special character";
1088 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001089 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001090 }
1091 else {
1092 *p++ = ch;
1093 s++;
1094 }
1095 continue;
1096 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001097 outpos = p-PyUnicode_AS_UNICODE(unicode);
1098 endinpos = s-starts;
1099 if (unicode_decode_call_errorhandler(
1100 errors, &errorHandler,
1101 "utf7", errmsg,
1102 starts, size, &startinpos, &endinpos, &exc, &s,
1103 (PyObject **)&unicode, &outpos, &p))
1104 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001105 }
1106
1107 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001108 outpos = p-PyUnicode_AS_UNICODE(unicode);
1109 endinpos = size;
1110 if (unicode_decode_call_errorhandler(
1111 errors, &errorHandler,
1112 "utf7", "unterminated shift sequence",
1113 starts, size, &startinpos, &endinpos, &exc, &s,
1114 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001115 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001116 if (s < e)
1117 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001118 }
1119
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001120 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001121 goto onError;
1122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001123 Py_XDECREF(errorHandler);
1124 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001125 return (PyObject *)unicode;
1126
1127onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001128 Py_XDECREF(errorHandler);
1129 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001130 Py_DECREF(unicode);
1131 return NULL;
1132}
1133
1134
1135PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001136 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 int encodeSetO,
1138 int encodeWhiteSpace,
1139 const char *errors)
1140{
1141 PyObject *v;
1142 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001143 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001144 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001145 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 unsigned int bitsleft = 0;
1147 unsigned long charsleft = 0;
1148 char * out;
1149 char * start;
1150
1151 if (size == 0)
1152 return PyString_FromStringAndSize(NULL, 0);
1153
1154 v = PyString_FromStringAndSize(NULL, cbAllocated);
1155 if (v == NULL)
1156 return NULL;
1157
1158 start = out = PyString_AS_STRING(v);
1159 for (;i < size; ++i) {
1160 Py_UNICODE ch = s[i];
1161
1162 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001163 if (ch == '+') {
1164 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001165 *out++ = '-';
1166 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1167 charsleft = ch;
1168 bitsleft = 16;
1169 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001170 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001171 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001172 } else {
1173 *out++ = (char) ch;
1174 }
1175 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001176 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1177 *out++ = B64(charsleft << (6-bitsleft));
1178 charsleft = 0;
1179 bitsleft = 0;
1180 /* Characters not in the BASE64 set implicitly unshift the sequence
1181 so no '-' is required, except if the character is itself a '-' */
1182 if (B64CHAR(ch) || ch == '-') {
1183 *out++ = '-';
1184 }
1185 inShift = 0;
1186 *out++ = (char) ch;
1187 } else {
1188 bitsleft += 16;
1189 charsleft = (charsleft << 16) | ch;
1190 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1191
1192 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001193 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001194 or '-' then the shift sequence will be terminated implicitly and we
1195 don't have to insert a '-'. */
1196
1197 if (bitsleft == 0) {
1198 if (i + 1 < size) {
1199 Py_UNICODE ch2 = s[i+1];
1200
1201 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001202
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001203 } else if (B64CHAR(ch2) || ch2 == '-') {
1204 *out++ = '-';
1205 inShift = 0;
1206 } else {
1207 inShift = 0;
1208 }
1209
1210 }
1211 else {
1212 *out++ = '-';
1213 inShift = 0;
1214 }
1215 }
Tim Petersced69f82003-09-16 20:30:58 +00001216 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001217 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001218 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001219 if (bitsleft) {
1220 *out++= B64(charsleft << (6-bitsleft) );
1221 *out++ = '-';
1222 }
1223
Tim Peters5de98422002-04-27 18:44:32 +00001224 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001225 return v;
1226}
1227
/* The UTF-7 helper macros are not needed past this point. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1234
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235/* --- UTF-8 Codec -------------------------------------------------------- */
1236
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it starts.  Zero means illegal prefix.  See RFC 2279 for details.
   One row per 16 byte values; the row's first value is noted. */
static char utf8_code_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1258
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001260 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 const char *errors)
1262{
Walter Dörwald69652032004-09-07 20:24:22 +00001263 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1264}
1265
1266PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001267 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001268 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001269 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001271 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001273 Py_ssize_t startinpos;
1274 Py_ssize_t endinpos;
1275 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 const char *e;
1277 PyUnicodeObject *unicode;
1278 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 PyObject *errorHandler = NULL;
1281 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282
1283 /* Note: size will always be longer than the resulting Unicode
1284 character count */
1285 unicode = _PyUnicode_New(size);
1286 if (!unicode)
1287 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001288 if (size == 0) {
1289 if (consumed)
1290 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293
1294 /* Unpack UTF-8 encoded data */
1295 p = unicode->str;
1296 e = s + size;
1297
1298 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001299 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300
1301 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 s++;
1304 continue;
1305 }
1306
1307 n = utf8_code_length[ch];
1308
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001309 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001310 if (consumed)
1311 break;
1312 else {
1313 errmsg = "unexpected end of data";
1314 startinpos = s-starts;
1315 endinpos = size;
1316 goto utf8Error;
1317 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319
1320 switch (n) {
1321
1322 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001323 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001324 startinpos = s-starts;
1325 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001326 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327
1328 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001329 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001330 startinpos = s-starts;
1331 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001332 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333
1334 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001335 if ((s[1] & 0xc0) != 0x80) {
1336 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 startinpos = s-starts;
1338 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001339 goto utf8Error;
1340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 startinpos = s-starts;
1344 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001345 errmsg = "illegal encoding";
1346 goto utf8Error;
1347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001349 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 break;
1351
1352 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001353 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001354 (s[2] & 0xc0) != 0x80) {
1355 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001356 startinpos = s-starts;
1357 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001358 goto utf8Error;
1359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001361 if (ch < 0x0800) {
1362 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001363 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001364
1365 XXX For wide builds (UCS-4) we should probably try
1366 to recombine the surrogates into a single code
1367 unit.
1368 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 startinpos = s-starts;
1371 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001372 goto utf8Error;
1373 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001375 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001376 break;
1377
1378 case 4:
1379 if ((s[1] & 0xc0) != 0x80 ||
1380 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001381 (s[3] & 0xc0) != 0x80) {
1382 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 startinpos = s-starts;
1384 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001385 goto utf8Error;
1386 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001387 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1388 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1389 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001390 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001391 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001392 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001393 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001394 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001395 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001396 startinpos = s-starts;
1397 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001398 goto utf8Error;
1399 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001400#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001401 *p++ = (Py_UNICODE)ch;
1402#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001403 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001404
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001405 /* translate from 10000..10FFFF to 0..FFFF */
1406 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001407
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001408 /* high surrogate = top 10 bits added to D800 */
1409 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001410
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001412 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001413#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 break;
1415
1416 default:
1417 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001418 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001419 startinpos = s-starts;
1420 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001421 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 }
1423 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001424 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001425
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001426 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427 outpos = p-PyUnicode_AS_UNICODE(unicode);
1428 if (unicode_decode_call_errorhandler(
1429 errors, &errorHandler,
1430 "utf8", errmsg,
1431 starts, size, &startinpos, &endinpos, &exc, &s,
1432 (PyObject **)&unicode, &outpos, &p))
1433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434 }
Walter Dörwald69652032004-09-07 20:24:22 +00001435 if (consumed)
1436 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437
1438 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001439 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 goto onError;
1441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001442 Py_XDECREF(errorHandler);
1443 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444 return (PyObject *)unicode;
1445
1446onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001447 Py_XDECREF(errorHandler);
1448 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001449 Py_DECREF(unicode);
1450 return NULL;
1451}
1452
Tim Peters602f7402002-04-27 18:03:26 +00001453/* Allocation strategy: if the string is short, convert into a stack buffer
1454 and allocate exactly as much space needed at the end. Else allocate the
1455 maximum possible needed (4 result bytes per Unicode character), and return
1456 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001457*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001458PyObject *
1459PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001460 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001461 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462{
Tim Peters602f7402002-04-27 18:03:26 +00001463#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001464
Martin v. Löwis18e16552006-02-15 17:27:45 +00001465 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001466 PyObject *v; /* result string object */
1467 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001468 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001469 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001470 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001471
Tim Peters602f7402002-04-27 18:03:26 +00001472 assert(s != NULL);
1473 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474
Tim Peters602f7402002-04-27 18:03:26 +00001475 if (size <= MAX_SHORT_UNICHARS) {
1476 /* Write into the stack buffer; nallocated can't overflow.
1477 * At the end, we'll allocate exactly as much heap space as it
1478 * turns out we need.
1479 */
1480 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1481 v = NULL; /* will allocate after we're done */
1482 p = stackbuf;
1483 }
1484 else {
1485 /* Overallocate on the heap, and give the excess back at the end. */
1486 nallocated = size * 4;
1487 if (nallocated / 4 != size) /* overflow! */
1488 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001489 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001490 if (v == NULL)
1491 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001492 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001493 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001494
Tim Peters602f7402002-04-27 18:03:26 +00001495 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001496 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001497
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001498 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001499 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001501
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001503 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001504 *p++ = (char)(0xc0 | (ch >> 6));
1505 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001506 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001507 else {
Tim Peters602f7402002-04-27 18:03:26 +00001508 /* Encode UCS2 Unicode ordinals */
1509 if (ch < 0x10000) {
1510 /* Special case: check for high surrogate */
1511 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1512 Py_UCS4 ch2 = s[i];
1513 /* Check for low surrogate and combine the two to
1514 form a UCS4 value */
1515 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001516 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001517 i++;
1518 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001519 }
Tim Peters602f7402002-04-27 18:03:26 +00001520 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001521 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001522 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001523 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1524 *p++ = (char)(0x80 | (ch & 0x3f));
1525 continue;
1526 }
1527encodeUCS4:
1528 /* Encode UCS4 Unicode ordinals */
1529 *p++ = (char)(0xf0 | (ch >> 18));
1530 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1531 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1532 *p++ = (char)(0x80 | (ch & 0x3f));
1533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001535
Tim Peters602f7402002-04-27 18:03:26 +00001536 if (v == NULL) {
1537 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001538 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001539 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001540 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001541 }
1542 else {
1543 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001544 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001545 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001546 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001549
Tim Peters602f7402002-04-27 18:03:26 +00001550#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551}
1552
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1554{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 if (!PyUnicode_Check(unicode)) {
1556 PyErr_BadArgument();
1557 return NULL;
1558 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001559 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1560 PyUnicode_GET_SIZE(unicode),
1561 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562}
1563
1564/* --- UTF-16 Codec ------------------------------------------------------- */
1565
Tim Peters772747b2001-08-09 22:21:55 +00001566PyObject *
1567PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001568 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001569 const char *errors,
1570 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571{
Walter Dörwald69652032004-09-07 20:24:22 +00001572 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1573}
1574
1575PyObject *
1576PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001577 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001578 const char *errors,
1579 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001580 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001582 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001583 Py_ssize_t startinpos;
1584 Py_ssize_t endinpos;
1585 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 PyUnicodeObject *unicode;
1587 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001588 const unsigned char *q, *e;
1589 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001590 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001591 /* Offsets from q for retrieving byte pairs in the right order. */
1592#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1593 int ihi = 1, ilo = 0;
1594#else
1595 int ihi = 0, ilo = 1;
1596#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001597 PyObject *errorHandler = NULL;
1598 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599
1600 /* Note: size will always be longer than the resulting Unicode
1601 character count */
1602 unicode = _PyUnicode_New(size);
1603 if (!unicode)
1604 return NULL;
1605 if (size == 0)
1606 return (PyObject *)unicode;
1607
1608 /* Unpack UTF-16 encoded data */
1609 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001610 q = (unsigned char *)s;
1611 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612
1613 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001614 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001616 /* Check for BOM marks (U+FEFF) in the input and adjust current
1617 byte order setting accordingly. In native mode, the leading BOM
1618 mark is skipped, in all other modes, it is copied to the output
1619 stream as-is (giving a ZWNBSP character). */
1620 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001621 if (size >= 2) {
1622 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001623#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001624 if (bom == 0xFEFF) {
1625 q += 2;
1626 bo = -1;
1627 }
1628 else if (bom == 0xFFFE) {
1629 q += 2;
1630 bo = 1;
1631 }
Tim Petersced69f82003-09-16 20:30:58 +00001632#else
Walter Dörwald69652032004-09-07 20:24:22 +00001633 if (bom == 0xFEFF) {
1634 q += 2;
1635 bo = 1;
1636 }
1637 else if (bom == 0xFFFE) {
1638 q += 2;
1639 bo = -1;
1640 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001641#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001642 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644
Tim Peters772747b2001-08-09 22:21:55 +00001645 if (bo == -1) {
1646 /* force LE */
1647 ihi = 1;
1648 ilo = 0;
1649 }
1650 else if (bo == 1) {
1651 /* force BE */
1652 ihi = 0;
1653 ilo = 1;
1654 }
1655
1656 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001658 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001660 if (consumed)
1661 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 errmsg = "truncated data";
1663 startinpos = ((const char *)q)-starts;
1664 endinpos = ((const char *)e)-starts;
1665 goto utf16Error;
1666 /* The remaining input chars are ignored if the callback
1667 chooses to skip the input */
1668 }
1669 ch = (q[ihi] << 8) | q[ilo];
1670
Tim Peters772747b2001-08-09 22:21:55 +00001671 q += 2;
1672
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 if (ch < 0xD800 || ch > 0xDFFF) {
1674 *p++ = ch;
1675 continue;
1676 }
1677
1678 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001679 if (q >= e) {
1680 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001681 startinpos = (((const char *)q)-2)-starts;
1682 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001683 goto utf16Error;
1684 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001685 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001686 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1687 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001688 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001689#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001690 *p++ = ch;
1691 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001692#else
1693 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001694#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001695 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001696 }
1697 else {
1698 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001699 startinpos = (((const char *)q)-4)-starts;
1700 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001701 goto utf16Error;
1702 }
1703
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001705 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 startinpos = (((const char *)q)-2)-starts;
1707 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001708 /* Fall through to report the error */
1709
1710 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001711 outpos = p-PyUnicode_AS_UNICODE(unicode);
1712 if (unicode_decode_call_errorhandler(
1713 errors, &errorHandler,
1714 "utf16", errmsg,
1715 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1716 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 }
1719
1720 if (byteorder)
1721 *byteorder = bo;
1722
Walter Dörwald69652032004-09-07 20:24:22 +00001723 if (consumed)
1724 *consumed = (const char *)q-starts;
1725
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001727 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 goto onError;
1729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 Py_XDECREF(errorHandler);
1731 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 return (PyObject *)unicode;
1733
1734onError:
1735 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001736 Py_XDECREF(errorHandler);
1737 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 return NULL;
1739}
1740
Tim Peters772747b2001-08-09 22:21:55 +00001741PyObject *
1742PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001743 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001744 const char *errors,
1745 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746{
1747 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001748 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001749#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001750 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001751#else
1752 const int pairs = 0;
1753#endif
Tim Peters772747b2001-08-09 22:21:55 +00001754 /* Offsets from p for storing byte pairs in the right order. */
1755#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1756 int ihi = 1, ilo = 0;
1757#else
1758 int ihi = 0, ilo = 1;
1759#endif
1760
1761#define STORECHAR(CH) \
1762 do { \
1763 p[ihi] = ((CH) >> 8) & 0xff; \
1764 p[ilo] = (CH) & 0xff; \
1765 p += 2; \
1766 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001768#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001769 for (i = pairs = 0; i < size; i++)
1770 if (s[i] >= 0x10000)
1771 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001772#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00001773 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001774 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 if (v == NULL)
1776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777
Walter Dörwald3cc34522007-05-04 10:48:27 +00001778 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001780 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001781 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001782 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001783
1784 if (byteorder == -1) {
1785 /* force LE */
1786 ihi = 1;
1787 ilo = 0;
1788 }
1789 else if (byteorder == 1) {
1790 /* force BE */
1791 ihi = 0;
1792 ilo = 1;
1793 }
1794
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001795 while (size-- > 0) {
1796 Py_UNICODE ch = *s++;
1797 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001798#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001799 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001800 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1801 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001803#endif
Tim Peters772747b2001-08-09 22:21:55 +00001804 STORECHAR(ch);
1805 if (ch2)
1806 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001809#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
1812PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1813{
1814 if (!PyUnicode_Check(unicode)) {
1815 PyErr_BadArgument();
1816 return NULL;
1817 }
1818 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1819 PyUnicode_GET_SIZE(unicode),
1820 NULL,
1821 0);
1822}
1823
1824/* --- Unicode Escape Codec ----------------------------------------------- */
1825
Fredrik Lundh06d12682001-01-24 07:59:11 +00001826static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001827
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001829 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 const char *errors)
1831{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001833 Py_ssize_t startinpos;
1834 Py_ssize_t endinpos;
1835 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001836 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001838 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001840 char* message;
1841 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 PyObject *errorHandler = NULL;
1843 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001844
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 /* Escaped strings will always be longer than the resulting
1846 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001847 length after conversion to the true value.
1848 (but if the error callback returns a long replacement string
1849 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 v = _PyUnicode_New(size);
1851 if (v == NULL)
1852 goto onError;
1853 if (size == 0)
1854 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001856 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 while (s < end) {
1860 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001861 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863
1864 /* Non-escape characters are interpreted as Unicode ordinals */
1865 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001866 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867 continue;
1868 }
1869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 /* \ - Escapes */
1872 s++;
1873 switch (*s++) {
1874
1875 /* \x escapes */
1876 case '\n': break;
1877 case '\\': *p++ = '\\'; break;
1878 case '\'': *p++ = '\''; break;
1879 case '\"': *p++ = '\"'; break;
1880 case 'b': *p++ = '\b'; break;
1881 case 'f': *p++ = '\014'; break; /* FF */
1882 case 't': *p++ = '\t'; break;
1883 case 'n': *p++ = '\n'; break;
1884 case 'r': *p++ = '\r'; break;
1885 case 'v': *p++ = '\013'; break; /* VT */
1886 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1887
1888 /* \OOO (octal) escapes */
1889 case '0': case '1': case '2': case '3':
1890 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001891 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001893 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001895 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001897 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 break;
1899
Fredrik Lundhccc74732001-02-18 22:13:49 +00001900 /* hex escapes */
1901 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 digits = 2;
1904 message = "truncated \\xXX escape";
1905 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001909 digits = 4;
1910 message = "truncated \\uXXXX escape";
1911 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912
Fredrik Lundhccc74732001-02-18 22:13:49 +00001913 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001914 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001915 digits = 8;
1916 message = "truncated \\UXXXXXXXX escape";
1917 hexescape:
1918 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 outpos = p-PyUnicode_AS_UNICODE(v);
1920 if (s+digits>end) {
1921 endinpos = size;
1922 if (unicode_decode_call_errorhandler(
1923 errors, &errorHandler,
1924 "unicodeescape", "end of string in escape sequence",
1925 starts, size, &startinpos, &endinpos, &exc, &s,
1926 (PyObject **)&v, &outpos, &p))
1927 goto onError;
1928 goto nextByte;
1929 }
1930 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001932 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001933 endinpos = (s+i+1)-starts;
1934 if (unicode_decode_call_errorhandler(
1935 errors, &errorHandler,
1936 "unicodeescape", message,
1937 starts, size, &startinpos, &endinpos, &exc, &s,
1938 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001939 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001941 }
1942 chr = (chr<<4) & ~0xF;
1943 if (c >= '0' && c <= '9')
1944 chr += c - '0';
1945 else if (c >= 'a' && c <= 'f')
1946 chr += 10 + c - 'a';
1947 else
1948 chr += 10 + c - 'A';
1949 }
1950 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001951 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001952 /* _decoding_error will have already written into the
1953 target buffer. */
1954 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001955 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001956 /* when we get here, chr is a 32-bit unicode character */
1957 if (chr <= 0xffff)
1958 /* UCS-2 character */
1959 *p++ = (Py_UNICODE) chr;
1960 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001961 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001962 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001963#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001964 *p++ = chr;
1965#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001966 chr -= 0x10000L;
1967 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001968 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001969#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001970 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001971 endinpos = s-starts;
1972 outpos = p-PyUnicode_AS_UNICODE(v);
1973 if (unicode_decode_call_errorhandler(
1974 errors, &errorHandler,
1975 "unicodeescape", "illegal Unicode character",
1976 starts, size, &startinpos, &endinpos, &exc, &s,
1977 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001978 goto onError;
1979 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001980 break;
1981
1982 /* \N{name} */
1983 case 'N':
1984 message = "malformed \\N character escape";
1985 if (ucnhash_CAPI == NULL) {
1986 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001987 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001988 m = PyImport_ImportModule("unicodedata");
1989 if (m == NULL)
1990 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001991 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001992 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001993 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001995 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001996 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001997 if (ucnhash_CAPI == NULL)
1998 goto ucnhashError;
1999 }
2000 if (*s == '{') {
2001 const char *start = s+1;
2002 /* look for the closing brace */
2003 while (*s != '}' && s < end)
2004 s++;
2005 if (s > start && s < end && *s == '}') {
2006 /* found a name. look it up in the unicode database */
2007 message = "unknown Unicode character name";
2008 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002009 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002010 goto store;
2011 }
2012 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 endinpos = s-starts;
2014 outpos = p-PyUnicode_AS_UNICODE(v);
2015 if (unicode_decode_call_errorhandler(
2016 errors, &errorHandler,
2017 "unicodeescape", message,
2018 starts, size, &startinpos, &endinpos, &exc, &s,
2019 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002020 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002021 break;
2022
2023 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002024 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002025 message = "\\ at end of string";
2026 s--;
2027 endinpos = s-starts;
2028 outpos = p-PyUnicode_AS_UNICODE(v);
2029 if (unicode_decode_call_errorhandler(
2030 errors, &errorHandler,
2031 "unicodeescape", message,
2032 starts, size, &startinpos, &endinpos, &exc, &s,
2033 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002034 goto onError;
2035 }
2036 else {
2037 *p++ = '\\';
2038 *p++ = (unsigned char)s[-1];
2039 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002040 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 nextByte:
2043 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002045 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002046 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002047 Py_XDECREF(errorHandler);
2048 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002050
Fredrik Lundhccc74732001-02-18 22:13:49 +00002051ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002052 PyErr_SetString(
2053 PyExc_UnicodeError,
2054 "\\N escapes not supported (can't load unicodedata module)"
2055 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002056 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 Py_XDECREF(errorHandler);
2058 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002059 return NULL;
2060
Fredrik Lundhccc74732001-02-18 22:13:49 +00002061onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 Py_XDECREF(errorHandler);
2064 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 return NULL;
2066}
2067
/* Return a Unicode-Escape string version of the Unicode object
   (implemented by unicodeescape_string() below).

   If quotes is true, the string is enclosed in "" or '' quotes as
   appropriate.

*/
2074
Thomas Wouters477c8d52006-05-27 19:21:47 +00002075Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2076 Py_ssize_t size,
2077 Py_UNICODE ch)
2078{
2079 /* like wcschr, but doesn't stop at NULL characters */
2080
2081 while (size-- > 0) {
2082 if (*s == ch)
2083 return s;
2084 s++;
2085 }
2086
2087 return NULL;
2088}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002089
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090static
2091PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002092 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 int quotes)
2094{
2095 PyObject *repr;
2096 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002098 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099
Thomas Wouters89f507f2006-12-13 04:49:30 +00002100 /* XXX(nnorwitz): rather than over-allocating, it would be
2101 better to choose a different scheme. Perhaps scan the
2102 first N-chars of the string and allocate based on that size.
2103 */
2104 /* Initial allocation is based on the longest-possible unichr
2105 escape.
2106
2107 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2108 unichr, so in this case it's the longest unichr escape. In
2109 narrow (UTF-16) builds this is five chars per source unichr
2110 since there are two unichrs in the surrogate pair, so in narrow
2111 (UTF-16) builds it's not the longest unichr escape.
2112
2113 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2114 so in the narrow (UTF-16) build case it's the longest unichr
2115 escape.
2116 */
2117
2118 repr = PyString_FromStringAndSize(NULL,
2119 2
2120#ifdef Py_UNICODE_WIDE
2121 + 10*size
2122#else
2123 + 6*size
2124#endif
2125 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 if (repr == NULL)
2127 return NULL;
2128
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002129 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130
2131 if (quotes) {
Tim Petersced69f82003-09-16 20:30:58 +00002132 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 !findchar(s, size, '"')) ? '"' : '\'';
2134 }
2135 while (size-- > 0) {
2136 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002137
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002138 /* Escape quotes and backslashes */
2139 if ((quotes &&
Guido van Rossum572dbf82007-04-27 23:53:51 +00002140 ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 *p++ = '\\';
2142 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002143 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002144 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002145
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002146#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002147 /* Map 21-bit characters to '\U00xxxxxx' */
2148 else if (ch >= 0x10000) {
2149 *p++ = '\\';
2150 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002151 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2152 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2153 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2154 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2155 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2156 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2157 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002158 *p++ = hexdigit[ch & 0x0000000F];
2159 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002160 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002161#else
2162 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002163 else if (ch >= 0xD800 && ch < 0xDC00) {
2164 Py_UNICODE ch2;
2165 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002166
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002167 ch2 = *s++;
2168 size--;
2169 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2170 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2171 *p++ = '\\';
2172 *p++ = 'U';
2173 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2174 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2175 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2176 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2177 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2178 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2179 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2180 *p++ = hexdigit[ucs & 0x0000000F];
2181 continue;
2182 }
2183 /* Fall through: isolated surrogates are copied as-is */
2184 s--;
2185 size++;
2186 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002187#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002188
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002190 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 *p++ = '\\';
2192 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002193 *p++ = hexdigit[(ch >> 12) & 0x000F];
2194 *p++ = hexdigit[(ch >> 8) & 0x000F];
2195 *p++ = hexdigit[(ch >> 4) & 0x000F];
2196 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002198
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002199 /* Map special whitespace to '\t', \n', '\r' */
2200 else if (ch == '\t') {
2201 *p++ = '\\';
2202 *p++ = 't';
2203 }
2204 else if (ch == '\n') {
2205 *p++ = '\\';
2206 *p++ = 'n';
2207 }
2208 else if (ch == '\r') {
2209 *p++ = '\\';
2210 *p++ = 'r';
2211 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002212
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002213 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002214 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002216 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002217 *p++ = hexdigit[(ch >> 4) & 0x000F];
2218 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002219 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 /* Copy everything else as-is */
2222 else
2223 *p++ = (char) ch;
2224 }
2225 if (quotes)
Guido van Rossum572dbf82007-04-27 23:53:51 +00002226 *p++ = PyString_AS_STRING(repr)[0];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227
2228 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002229 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return repr;
2231}
2232
2233PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002234 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235{
2236 return unicodeescape_string(s, size, 0);
2237}
2238
2239PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2240{
2241 if (!PyUnicode_Check(unicode)) {
2242 PyErr_BadArgument();
2243 return NULL;
2244 }
2245 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2246 PyUnicode_GET_SIZE(unicode));
2247}
2248
2249/* --- Raw Unicode Escape Codec ------------------------------------------- */
2250
2251PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002252 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002253 const char *errors)
2254{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002256 Py_ssize_t startinpos;
2257 Py_ssize_t endinpos;
2258 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002260 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 const char *end;
2262 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002263 PyObject *errorHandler = NULL;
2264 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002265
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 /* Escaped strings will always be longer than the resulting
2267 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 length after conversion to the true value. (But decoding error
2269 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 v = _PyUnicode_New(size);
2271 if (v == NULL)
2272 goto onError;
2273 if (size == 0)
2274 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 end = s + size;
2277 while (s < end) {
2278 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002279 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002281 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282
2283 /* Non-escape characters are interpreted as Unicode ordinals */
2284 if (*s != '\\') {
2285 *p++ = (unsigned char)*s++;
2286 continue;
2287 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002288 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289
2290 /* \u-escapes are only interpreted iff the number of leading
2291 backslashes if odd */
2292 bs = s;
2293 for (;s < end;) {
2294 if (*s != '\\')
2295 break;
2296 *p++ = (unsigned char)*s++;
2297 }
2298 if (((s - bs) & 1) == 0 ||
2299 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002300 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 continue;
2302 }
2303 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002304 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 s++;
2306
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002307 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002308 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002309 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002310 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 endinpos = s-starts;
2313 if (unicode_decode_call_errorhandler(
2314 errors, &errorHandler,
2315 "rawunicodeescape", "truncated \\uXXXX",
2316 starts, size, &startinpos, &endinpos, &exc, &s,
2317 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002319 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002320 }
2321 x = (x<<4) & ~0xF;
2322 if (c >= '0' && c <= '9')
2323 x += c - '0';
2324 else if (c >= 'a' && c <= 'f')
2325 x += 10 + c - 'a';
2326 else
2327 x += 10 + c - 'A';
2328 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002329#ifndef Py_UNICODE_WIDE
2330 if (x > 0x10000) {
2331 if (unicode_decode_call_errorhandler(
2332 errors, &errorHandler,
2333 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2334 starts, size, &startinpos, &endinpos, &exc, &s,
2335 (PyObject **)&v, &outpos, &p))
2336 goto onError;
2337 }
2338#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002339 *p++ = x;
2340 nextByte:
2341 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002343 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002344 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002345 Py_XDECREF(errorHandler);
2346 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002348
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 onError:
2350 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 Py_XDECREF(errorHandler);
2352 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 return NULL;
2354}
2355
2356PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002357 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358{
2359 PyObject *repr;
2360 char *p;
2361 char *q;
2362
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002363 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002365#ifdef Py_UNICODE_WIDE
2366 repr = PyString_FromStringAndSize(NULL, 10 * size);
2367#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 if (repr == NULL)
2371 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002372 if (size == 0)
2373 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374
2375 p = q = PyString_AS_STRING(repr);
2376 while (size-- > 0) {
2377 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002378#ifdef Py_UNICODE_WIDE
2379 /* Map 32-bit characters to '\Uxxxxxxxx' */
2380 if (ch >= 0x10000) {
2381 *p++ = '\\';
2382 *p++ = 'U';
2383 *p++ = hexdigit[(ch >> 28) & 0xf];
2384 *p++ = hexdigit[(ch >> 24) & 0xf];
2385 *p++ = hexdigit[(ch >> 20) & 0xf];
2386 *p++ = hexdigit[(ch >> 16) & 0xf];
2387 *p++ = hexdigit[(ch >> 12) & 0xf];
2388 *p++ = hexdigit[(ch >> 8) & 0xf];
2389 *p++ = hexdigit[(ch >> 4) & 0xf];
2390 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002391 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002392 else
2393#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 /* Map 16-bit characters to '\uxxxx' */
2395 if (ch >= 256) {
2396 *p++ = '\\';
2397 *p++ = 'u';
2398 *p++ = hexdigit[(ch >> 12) & 0xf];
2399 *p++ = hexdigit[(ch >> 8) & 0xf];
2400 *p++ = hexdigit[(ch >> 4) & 0xf];
2401 *p++ = hexdigit[ch & 15];
2402 }
2403 /* Copy everything else as-is */
2404 else
2405 *p++ = (char) ch;
2406 }
2407 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002408 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 return repr;
2410}
2411
2412PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2413{
2414 if (!PyUnicode_Check(unicode)) {
2415 PyErr_BadArgument();
2416 return NULL;
2417 }
2418 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2419 PyUnicode_GET_SIZE(unicode));
2420}
2421
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002422/* --- Unicode Internal Codec ------------------------------------------- */
2423
2424PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002425 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002426 const char *errors)
2427{
2428 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002429 Py_ssize_t startinpos;
2430 Py_ssize_t endinpos;
2431 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002432 PyUnicodeObject *v;
2433 Py_UNICODE *p;
2434 const char *end;
2435 const char *reason;
2436 PyObject *errorHandler = NULL;
2437 PyObject *exc = NULL;
2438
Neal Norwitzd43069c2006-01-08 01:12:10 +00002439#ifdef Py_UNICODE_WIDE
2440 Py_UNICODE unimax = PyUnicode_GetMax();
2441#endif
2442
Thomas Wouters89f507f2006-12-13 04:49:30 +00002443 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002444 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2445 if (v == NULL)
2446 goto onError;
2447 if (PyUnicode_GetSize((PyObject *)v) == 0)
2448 return (PyObject *)v;
2449 p = PyUnicode_AS_UNICODE(v);
2450 end = s + size;
2451
2452 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002453 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002454 /* We have to sanity check the raw data, otherwise doom looms for
2455 some malformed UCS-4 data. */
2456 if (
2457 #ifdef Py_UNICODE_WIDE
2458 *p > unimax || *p < 0 ||
2459 #endif
2460 end-s < Py_UNICODE_SIZE
2461 )
2462 {
2463 startinpos = s - starts;
2464 if (end-s < Py_UNICODE_SIZE) {
2465 endinpos = end-starts;
2466 reason = "truncated input";
2467 }
2468 else {
2469 endinpos = s - starts + Py_UNICODE_SIZE;
2470 reason = "illegal code point (> 0x10FFFF)";
2471 }
2472 outpos = p - PyUnicode_AS_UNICODE(v);
2473 if (unicode_decode_call_errorhandler(
2474 errors, &errorHandler,
2475 "unicode_internal", reason,
2476 starts, size, &startinpos, &endinpos, &exc, &s,
2477 (PyObject **)&v, &outpos, &p)) {
2478 goto onError;
2479 }
2480 }
2481 else {
2482 p++;
2483 s += Py_UNICODE_SIZE;
2484 }
2485 }
2486
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002487 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002488 goto onError;
2489 Py_XDECREF(errorHandler);
2490 Py_XDECREF(exc);
2491 return (PyObject *)v;
2492
2493 onError:
2494 Py_XDECREF(v);
2495 Py_XDECREF(errorHandler);
2496 Py_XDECREF(exc);
2497 return NULL;
2498}
2499
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500/* --- Latin-1 Codec ------------------------------------------------------ */
2501
2502PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002503 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 const char *errors)
2505{
2506 PyUnicodeObject *v;
2507 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002508
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002510 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002511 Py_UNICODE r = *(unsigned char*)s;
2512 return PyUnicode_FromUnicode(&r, 1);
2513 }
2514
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 v = _PyUnicode_New(size);
2516 if (v == NULL)
2517 goto onError;
2518 if (size == 0)
2519 return (PyObject *)v;
2520 p = PyUnicode_AS_UNICODE(v);
2521 while (size-- > 0)
2522 *p++ = (unsigned char)*s++;
2523 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002524
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 onError:
2526 Py_XDECREF(v);
2527 return NULL;
2528}
2529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530/* create or adjust a UnicodeEncodeError */
2531static void make_encode_exception(PyObject **exceptionObject,
2532 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002533 const Py_UNICODE *unicode, Py_ssize_t size,
2534 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 if (*exceptionObject == NULL) {
2538 *exceptionObject = PyUnicodeEncodeError_Create(
2539 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 }
2541 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2543 goto onError;
2544 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2545 goto onError;
2546 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2547 goto onError;
2548 return;
2549 onError:
2550 Py_DECREF(*exceptionObject);
2551 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 }
2553}
2554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555/* raises a UnicodeEncodeError */
2556static void raise_encode_exception(PyObject **exceptionObject,
2557 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002558 const Py_UNICODE *unicode, Py_ssize_t size,
2559 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 const char *reason)
2561{
2562 make_encode_exception(exceptionObject,
2563 encoding, unicode, size, startpos, endpos, reason);
2564 if (*exceptionObject != NULL)
2565 PyCodec_StrictErrors(*exceptionObject);
2566}
2567
2568/* error handling callback helper:
2569 build arguments, call the callback and check the arguments,
2570 put the result into newpos and return the replacement string, which
2571 has to be freed by the caller */
2572static PyObject *unicode_encode_call_errorhandler(const char *errors,
2573 PyObject **errorHandler,
2574 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002575 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2576 Py_ssize_t startpos, Py_ssize_t endpos,
2577 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002579 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580
2581 PyObject *restuple;
2582 PyObject *resunicode;
2583
2584 if (*errorHandler == NULL) {
2585 *errorHandler = PyCodec_LookupError(errors);
2586 if (*errorHandler == NULL)
2587 return NULL;
2588 }
2589
2590 make_encode_exception(exceptionObject,
2591 encoding, unicode, size, startpos, endpos, reason);
2592 if (*exceptionObject == NULL)
2593 return NULL;
2594
2595 restuple = PyObject_CallFunctionObjArgs(
2596 *errorHandler, *exceptionObject, NULL);
2597 if (restuple == NULL)
2598 return NULL;
2599 if (!PyTuple_Check(restuple)) {
2600 PyErr_Format(PyExc_TypeError, &argparse[4]);
2601 Py_DECREF(restuple);
2602 return NULL;
2603 }
2604 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2605 &resunicode, newpos)) {
2606 Py_DECREF(restuple);
2607 return NULL;
2608 }
2609 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002610 *newpos = size+*newpos;
2611 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002612 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002613 Py_DECREF(restuple);
2614 return NULL;
2615 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 Py_INCREF(resunicode);
2617 Py_DECREF(restuple);
2618 return resunicode;
2619}
2620
2621static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002622 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 const char *errors,
2624 int limit)
2625{
2626 /* output object */
2627 PyObject *res;
2628 /* pointers to the beginning and end+1 of input */
2629 const Py_UNICODE *startp = p;
2630 const Py_UNICODE *endp = p + size;
2631 /* pointer to the beginning of the unencodable characters */
2632 /* const Py_UNICODE *badp = NULL; */
2633 /* pointer into the output */
2634 char *str;
2635 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002636 Py_ssize_t respos = 0;
2637 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002638 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2639 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 PyObject *errorHandler = NULL;
2641 PyObject *exc = NULL;
2642 /* the following variable is used for caching string comparisons
2643 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2644 int known_errorHandler = -1;
2645
2646 /* allocate enough for a simple encoding without
2647 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002648 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 if (res == NULL)
2650 goto onError;
2651 if (size == 0)
2652 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002653 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 ressize = size;
2655
2656 while (p<endp) {
2657 Py_UNICODE c = *p;
2658
2659 /* can we encode this? */
2660 if (c<limit) {
2661 /* no overflow check, because we know that the space is enough */
2662 *str++ = (char)c;
2663 ++p;
2664 }
2665 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002666 Py_ssize_t unicodepos = p-startp;
2667 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002668 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002669 Py_ssize_t repsize;
2670 Py_ssize_t newpos;
2671 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 Py_UNICODE *uni2;
2673 /* startpos for collecting unencodable chars */
2674 const Py_UNICODE *collstart = p;
2675 const Py_UNICODE *collend = p;
2676 /* find all unecodable characters */
2677 while ((collend < endp) && ((*collend)>=limit))
2678 ++collend;
2679 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2680 if (known_errorHandler==-1) {
2681 if ((errors==NULL) || (!strcmp(errors, "strict")))
2682 known_errorHandler = 1;
2683 else if (!strcmp(errors, "replace"))
2684 known_errorHandler = 2;
2685 else if (!strcmp(errors, "ignore"))
2686 known_errorHandler = 3;
2687 else if (!strcmp(errors, "xmlcharrefreplace"))
2688 known_errorHandler = 4;
2689 else
2690 known_errorHandler = 0;
2691 }
2692 switch (known_errorHandler) {
2693 case 1: /* strict */
2694 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2695 goto onError;
2696 case 2: /* replace */
2697 while (collstart++<collend)
2698 *str++ = '?'; /* fall through */
2699 case 3: /* ignore */
2700 p = collend;
2701 break;
2702 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002703 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 /* determine replacement size (temporarily (mis)uses p) */
2705 for (p = collstart, repsize = 0; p < collend; ++p) {
2706 if (*p<10)
2707 repsize += 2+1+1;
2708 else if (*p<100)
2709 repsize += 2+2+1;
2710 else if (*p<1000)
2711 repsize += 2+3+1;
2712 else if (*p<10000)
2713 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002714#ifndef Py_UNICODE_WIDE
2715 else
2716 repsize += 2+5+1;
2717#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 else if (*p<100000)
2719 repsize += 2+5+1;
2720 else if (*p<1000000)
2721 repsize += 2+6+1;
2722 else
2723 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002724#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 }
2726 requiredsize = respos+repsize+(endp-collend);
2727 if (requiredsize > ressize) {
2728 if (requiredsize<2*ressize)
2729 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002730 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002732 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 ressize = requiredsize;
2734 }
2735 /* generate replacement (temporarily (mis)uses p) */
2736 for (p = collstart; p < collend; ++p) {
2737 str += sprintf(str, "&#%d;", (int)*p);
2738 }
2739 p = collend;
2740 break;
2741 default:
2742 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2743 encoding, reason, startp, size, &exc,
2744 collstart-startp, collend-startp, &newpos);
2745 if (repunicode == NULL)
2746 goto onError;
2747 /* need more space? (at least enough for what we
2748 have+the replacement+the rest of the string, so
2749 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002750 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 repsize = PyUnicode_GET_SIZE(repunicode);
2752 requiredsize = respos+repsize+(endp-collend);
2753 if (requiredsize > ressize) {
2754 if (requiredsize<2*ressize)
2755 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002756 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 Py_DECREF(repunicode);
2758 goto onError;
2759 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002760 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002761 ressize = requiredsize;
2762 }
2763 /* check if there is anything unencodable in the replacement
2764 and copy it to the output */
2765 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2766 c = *uni2;
2767 if (c >= limit) {
2768 raise_encode_exception(&exc, encoding, startp, size,
2769 unicodepos, unicodepos+1, reason);
2770 Py_DECREF(repunicode);
2771 goto onError;
2772 }
2773 *str = (char)c;
2774 }
2775 p = startp + newpos;
2776 Py_DECREF(repunicode);
2777 }
2778 }
2779 }
2780 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002781 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 if (respos<ressize)
2783 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002784 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 Py_XDECREF(errorHandler);
2786 Py_XDECREF(exc);
2787 return res;
2788
2789 onError:
2790 Py_XDECREF(res);
2791 Py_XDECREF(errorHandler);
2792 Py_XDECREF(exc);
2793 return NULL;
2794}
2795
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002797 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 const char *errors)
2799{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801}
2802
2803PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2804{
2805 if (!PyUnicode_Check(unicode)) {
2806 PyErr_BadArgument();
2807 return NULL;
2808 }
2809 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2810 PyUnicode_GET_SIZE(unicode),
2811 NULL);
2812}
2813
2814/* --- 7-bit ASCII Codec -------------------------------------------------- */
2815
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002817 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 const char *errors)
2819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 PyUnicodeObject *v;
2822 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002823 Py_ssize_t startinpos;
2824 Py_ssize_t endinpos;
2825 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 const char *e;
2827 PyObject *errorHandler = NULL;
2828 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002829
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002831 if (size == 1 && *(unsigned char*)s < 128) {
2832 Py_UNICODE r = *(unsigned char*)s;
2833 return PyUnicode_FromUnicode(&r, 1);
2834 }
Tim Petersced69f82003-09-16 20:30:58 +00002835
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 v = _PyUnicode_New(size);
2837 if (v == NULL)
2838 goto onError;
2839 if (size == 0)
2840 return (PyObject *)v;
2841 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 e = s + size;
2843 while (s < e) {
2844 register unsigned char c = (unsigned char)*s;
2845 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 ++s;
2848 }
2849 else {
2850 startinpos = s-starts;
2851 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002852 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002853 if (unicode_decode_call_errorhandler(
2854 errors, &errorHandler,
2855 "ascii", "ordinal not in range(128)",
2856 starts, size, &startinpos, &endinpos, &exc, &s,
2857 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002861 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002862 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002863 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864 Py_XDECREF(errorHandler);
2865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002867
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 onError:
2869 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 Py_XDECREF(errorHandler);
2871 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 return NULL;
2873}
2874
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002876 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 const char *errors)
2878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880}
2881
2882PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2883{
2884 if (!PyUnicode_Check(unicode)) {
2885 PyErr_BadArgument();
2886 return NULL;
2887 }
2888 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2889 PyUnicode_GET_SIZE(unicode),
2890 NULL);
2891}
2892
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002893#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002894
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002895/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002896
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002897#if SIZEOF_INT < SIZEOF_SSIZE_T
2898#define NEED_RETRY
2899#endif
2900
2901/* XXX This code is limited to "true" double-byte encodings, as
2902 a) it assumes an incomplete character consists of a single byte, and
2903 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2904 encodings, see IsDBCSLeadByteEx documentation. */
2905
/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   actually starts a character, judged by IsDBCSLeadByte() on that byte
   and on the character CharPrev() finds in front of it. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2916
2917/*
2918 * Decode MBCS string into unicode object. If 'final' is set, converts
2919 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2920 */
2921static int decode_mbcs(PyUnicodeObject **v,
2922 const char *s, /* MBCS string */
2923 int size, /* sizeof MBCS string */
2924 int final)
2925{
2926 Py_UNICODE *p;
2927 Py_ssize_t n = 0;
2928 int usize = 0;
2929
2930 assert(size >= 0);
2931
2932 /* Skip trailing lead-byte unless 'final' is set */
2933 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2934 --size;
2935
2936 /* First get the size of the result */
2937 if (size > 0) {
2938 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2939 if (usize == 0) {
2940 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2941 return -1;
2942 }
2943 }
2944
2945 if (*v == NULL) {
2946 /* Create unicode object */
2947 *v = _PyUnicode_New(usize);
2948 if (*v == NULL)
2949 return -1;
2950 }
2951 else {
2952 /* Extend unicode object */
2953 n = PyUnicode_GET_SIZE(*v);
2954 if (_PyUnicode_Resize(v, n + usize) < 0)
2955 return -1;
2956 }
2957
2958 /* Do the conversion */
2959 if (size > 0) {
2960 p = PyUnicode_AS_UNICODE(*v) + n;
2961 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2962 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2963 return -1;
2964 }
2965 }
2966
2967 return size;
2968}
2969
2970PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2971 Py_ssize_t size,
2972 const char *errors,
2973 Py_ssize_t *consumed)
2974{
2975 PyUnicodeObject *v = NULL;
2976 int done;
2977
2978 if (consumed)
2979 *consumed = 0;
2980
2981#ifdef NEED_RETRY
2982 retry:
2983 if (size > INT_MAX)
2984 done = decode_mbcs(&v, s, INT_MAX, 0);
2985 else
2986#endif
2987 done = decode_mbcs(&v, s, (int)size, !consumed);
2988
2989 if (done < 0) {
2990 Py_XDECREF(v);
2991 return NULL;
2992 }
2993
2994 if (consumed)
2995 *consumed += done;
2996
2997#ifdef NEED_RETRY
2998 if (size > INT_MAX) {
2999 s += done;
3000 size -= done;
3001 goto retry;
3002 }
3003#endif
3004
3005 return (PyObject *)v;
3006}
3007
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003008PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003009 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003010 const char *errors)
3011{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003012 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3013}
3014
3015/*
3016 * Convert unicode into string object (MBCS).
3017 * Returns 0 if succeed, -1 otherwise.
3018 */
3019static int encode_mbcs(PyObject **repr,
3020 const Py_UNICODE *p, /* unicode */
3021 int size) /* size of unicode */
3022{
3023 int mbcssize = 0;
3024 Py_ssize_t n = 0;
3025
3026 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003027
3028 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003029 if (size > 0) {
3030 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3031 if (mbcssize == 0) {
3032 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3033 return -1;
3034 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003035 }
3036
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003037 if (*repr == NULL) {
3038 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003039 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003040 if (*repr == NULL)
3041 return -1;
3042 }
3043 else {
3044 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003045 n = PyBytes_Size(*repr);
3046 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003047 return -1;
3048 }
3049
3050 /* Do the conversion */
3051 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003052 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003053 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3054 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3055 return -1;
3056 }
3057 }
3058
3059 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003060}
3061
3062PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003063 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003064 const char *errors)
3065{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003066 PyObject *repr = NULL;
3067 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003068
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003069#ifdef NEED_RETRY
3070 retry:
3071 if (size > INT_MAX)
3072 ret = encode_mbcs(&repr, p, INT_MAX);
3073 else
3074#endif
3075 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003076
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003077 if (ret < 0) {
3078 Py_XDECREF(repr);
3079 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003080 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003081
3082#ifdef NEED_RETRY
3083 if (size > INT_MAX) {
3084 p += INT_MAX;
3085 size -= INT_MAX;
3086 goto retry;
3087 }
3088#endif
3089
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003090 return repr;
3091}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003092
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003093PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3094{
3095 if (!PyUnicode_Check(unicode)) {
3096 PyErr_BadArgument();
3097 return NULL;
3098 }
3099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3100 PyUnicode_GET_SIZE(unicode),
3101 NULL);
3102}
3103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003104#undef NEED_RETRY
3105
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003106#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003107
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108/* --- Character Mapping Codec -------------------------------------------- */
3109
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003111 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 PyObject *mapping,
3113 const char *errors)
3114{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003115 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003116 Py_ssize_t startinpos;
3117 Py_ssize_t endinpos;
3118 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 PyUnicodeObject *v;
3121 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003122 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003123 PyObject *errorHandler = NULL;
3124 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003125 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003127
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 /* Default to Latin-1 */
3129 if (mapping == NULL)
3130 return PyUnicode_DecodeLatin1(s, size, errors);
3131
3132 v = _PyUnicode_New(size);
3133 if (v == NULL)
3134 goto onError;
3135 if (size == 0)
3136 return (PyObject *)v;
3137 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003139 if (PyUnicode_CheckExact(mapping)) {
3140 mapstring = PyUnicode_AS_UNICODE(mapping);
3141 maplen = PyUnicode_GET_SIZE(mapping);
3142 while (s < e) {
3143 unsigned char ch = *s;
3144 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003146 if (ch < maplen)
3147 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003149 if (x == 0xfffe) {
3150 /* undefined mapping */
3151 outpos = p-PyUnicode_AS_UNICODE(v);
3152 startinpos = s-starts;
3153 endinpos = startinpos+1;
3154 if (unicode_decode_call_errorhandler(
3155 errors, &errorHandler,
3156 "charmap", "character maps to <undefined>",
3157 starts, size, &startinpos, &endinpos, &exc, &s,
3158 (PyObject **)&v, &outpos, &p)) {
3159 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003160 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003161 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003162 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003163 *p++ = x;
3164 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003166 }
3167 else {
3168 while (s < e) {
3169 unsigned char ch = *s;
3170 PyObject *w, *x;
3171
3172 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3173 w = PyInt_FromLong((long)ch);
3174 if (w == NULL)
3175 goto onError;
3176 x = PyObject_GetItem(mapping, w);
3177 Py_DECREF(w);
3178 if (x == NULL) {
3179 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3180 /* No mapping found means: mapping is undefined. */
3181 PyErr_Clear();
3182 x = Py_None;
3183 Py_INCREF(x);
3184 } else
3185 goto onError;
3186 }
3187
3188 /* Apply mapping */
3189 if (PyInt_Check(x)) {
3190 long value = PyInt_AS_LONG(x);
3191 if (value < 0 || value > 65535) {
3192 PyErr_SetString(PyExc_TypeError,
3193 "character mapping must be in range(65536)");
3194 Py_DECREF(x);
3195 goto onError;
3196 }
3197 *p++ = (Py_UNICODE)value;
3198 }
3199 else if (x == Py_None) {
3200 /* undefined mapping */
3201 outpos = p-PyUnicode_AS_UNICODE(v);
3202 startinpos = s-starts;
3203 endinpos = startinpos+1;
3204 if (unicode_decode_call_errorhandler(
3205 errors, &errorHandler,
3206 "charmap", "character maps to <undefined>",
3207 starts, size, &startinpos, &endinpos, &exc, &s,
3208 (PyObject **)&v, &outpos, &p)) {
3209 Py_DECREF(x);
3210 goto onError;
3211 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003212 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003213 continue;
3214 }
3215 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003216 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003217
3218 if (targetsize == 1)
3219 /* 1-1 mapping */
3220 *p++ = *PyUnicode_AS_UNICODE(x);
3221
3222 else if (targetsize > 1) {
3223 /* 1-n mapping */
3224 if (targetsize > extrachars) {
3225 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003226 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3227 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003228 (targetsize << 2);
3229 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003230 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003231 if (_PyUnicode_Resize(&v,
3232 PyUnicode_GET_SIZE(v) + needed) < 0) {
3233 Py_DECREF(x);
3234 goto onError;
3235 }
3236 p = PyUnicode_AS_UNICODE(v) + oldpos;
3237 }
3238 Py_UNICODE_COPY(p,
3239 PyUnicode_AS_UNICODE(x),
3240 targetsize);
3241 p += targetsize;
3242 extrachars -= targetsize;
3243 }
3244 /* 1-0 mapping: skip the character */
3245 }
3246 else {
3247 /* wrong return value */
3248 PyErr_SetString(PyExc_TypeError,
3249 "character mapping must return integer, None or unicode");
3250 Py_DECREF(x);
3251 goto onError;
3252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003254 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 }
3257 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003258 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 Py_XDECREF(errorHandler);
3261 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 Py_XDECREF(errorHandler);
3266 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 Py_XDECREF(v);
3268 return NULL;
3269}
3270
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003271/* Charmap encoding: the lookup table */
3272
3273struct encoding_map{
3274 PyObject_HEAD
3275 unsigned char level1[32];
3276 int count2, count3;
3277 unsigned char level23[1];
3278};
3279
3280static PyObject*
3281encoding_map_size(PyObject *obj, PyObject* args)
3282{
3283 struct encoding_map *map = (struct encoding_map*)obj;
3284 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3285 128*map->count3);
3286}
3287
3288static PyMethodDef encoding_map_methods[] = {
3289 {"size", encoding_map_size, METH_NOARGS,
3290 PyDoc_STR("Return the size (in bytes) of this object") },
3291 { 0 }
3292};
3293
3294static void
3295encoding_map_dealloc(PyObject* o)
3296{
3297 PyObject_FREE(o);
3298}
3299
3300static PyTypeObject EncodingMapType = {
3301 PyObject_HEAD_INIT(NULL)
3302 0, /*ob_size*/
3303 "EncodingMap", /*tp_name*/
3304 sizeof(struct encoding_map), /*tp_basicsize*/
3305 0, /*tp_itemsize*/
3306 /* methods */
3307 encoding_map_dealloc, /*tp_dealloc*/
3308 0, /*tp_print*/
3309 0, /*tp_getattr*/
3310 0, /*tp_setattr*/
3311 0, /*tp_compare*/
3312 0, /*tp_repr*/
3313 0, /*tp_as_number*/
3314 0, /*tp_as_sequence*/
3315 0, /*tp_as_mapping*/
3316 0, /*tp_hash*/
3317 0, /*tp_call*/
3318 0, /*tp_str*/
3319 0, /*tp_getattro*/
3320 0, /*tp_setattro*/
3321 0, /*tp_as_buffer*/
3322 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3323 0, /*tp_doc*/
3324 0, /*tp_traverse*/
3325 0, /*tp_clear*/
3326 0, /*tp_richcompare*/
3327 0, /*tp_weaklistoffset*/
3328 0, /*tp_iter*/
3329 0, /*tp_iternext*/
3330 encoding_map_methods, /*tp_methods*/
3331 0, /*tp_members*/
3332 0, /*tp_getset*/
3333 0, /*tp_base*/
3334 0, /*tp_dict*/
3335 0, /*tp_descr_get*/
3336 0, /*tp_descr_set*/
3337 0, /*tp_dictoffset*/
3338 0, /*tp_init*/
3339 0, /*tp_alloc*/
3340 0, /*tp_new*/
3341 0, /*tp_free*/
3342 0, /*tp_is_gc*/
3343};
3344
3345PyObject*
3346PyUnicode_BuildEncodingMap(PyObject* string)
3347{
3348 Py_UNICODE *decode;
3349 PyObject *result;
3350 struct encoding_map *mresult;
3351 int i;
3352 int need_dict = 0;
3353 unsigned char level1[32];
3354 unsigned char level2[512];
3355 unsigned char *mlevel1, *mlevel2, *mlevel3;
3356 int count2 = 0, count3 = 0;
3357
3358 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
3362 decode = PyUnicode_AS_UNICODE(string);
3363 memset(level1, 0xFF, sizeof level1);
3364 memset(level2, 0xFF, sizeof level2);
3365
3366 /* If there isn't a one-to-one mapping of NULL to \0,
3367 or if there are non-BMP characters, we need to use
3368 a mapping dictionary. */
3369 if (decode[0] != 0)
3370 need_dict = 1;
3371 for (i = 1; i < 256; i++) {
3372 int l1, l2;
3373 if (decode[i] == 0
3374 #ifdef Py_UNICODE_WIDE
3375 || decode[i] > 0xFFFF
3376 #endif
3377 ) {
3378 need_dict = 1;
3379 break;
3380 }
3381 if (decode[i] == 0xFFFE)
3382 /* unmapped character */
3383 continue;
3384 l1 = decode[i] >> 11;
3385 l2 = decode[i] >> 7;
3386 if (level1[l1] == 0xFF)
3387 level1[l1] = count2++;
3388 if (level2[l2] == 0xFF)
3389 level2[l2] = count3++;
3390 }
3391
3392 if (count2 >= 0xFF || count3 >= 0xFF)
3393 need_dict = 1;
3394
3395 if (need_dict) {
3396 PyObject *result = PyDict_New();
3397 PyObject *key, *value;
3398 if (!result)
3399 return NULL;
3400 for (i = 0; i < 256; i++) {
3401 key = value = NULL;
3402 key = PyInt_FromLong(decode[i]);
3403 value = PyInt_FromLong(i);
3404 if (!key || !value)
3405 goto failed1;
3406 if (PyDict_SetItem(result, key, value) == -1)
3407 goto failed1;
3408 Py_DECREF(key);
3409 Py_DECREF(value);
3410 }
3411 return result;
3412 failed1:
3413 Py_XDECREF(key);
3414 Py_XDECREF(value);
3415 Py_DECREF(result);
3416 return NULL;
3417 }
3418
3419 /* Create a three-level trie */
3420 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3421 16*count2 + 128*count3 - 1);
3422 if (!result)
3423 return PyErr_NoMemory();
3424 PyObject_Init(result, &EncodingMapType);
3425 mresult = (struct encoding_map*)result;
3426 mresult->count2 = count2;
3427 mresult->count3 = count3;
3428 mlevel1 = mresult->level1;
3429 mlevel2 = mresult->level23;
3430 mlevel3 = mresult->level23 + 16*count2;
3431 memcpy(mlevel1, level1, 32);
3432 memset(mlevel2, 0xFF, 16*count2);
3433 memset(mlevel3, 0, 128*count3);
3434 count3 = 0;
3435 for (i = 1; i < 256; i++) {
3436 int o1, o2, o3, i2, i3;
3437 if (decode[i] == 0xFFFE)
3438 /* unmapped character */
3439 continue;
3440 o1 = decode[i]>>11;
3441 o2 = (decode[i]>>7) & 0xF;
3442 i2 = 16*mlevel1[o1] + o2;
3443 if (mlevel2[i2] == 0xFF)
3444 mlevel2[i2] = count3++;
3445 o3 = decode[i] & 0x7F;
3446 i3 = 128*mlevel2[i2] + o3;
3447 mlevel3[i3] = i;
3448 }
3449 return result;
3450}
3451
3452static int
3453encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3454{
3455 struct encoding_map *map = (struct encoding_map*)mapping;
3456 int l1 = c>>11;
3457 int l2 = (c>>7) & 0xF;
3458 int l3 = c & 0x7F;
3459 int i;
3460
3461#ifdef Py_UNICODE_WIDE
3462 if (c > 0xFFFF) {
3463 return -1;
3464 }
3465#endif
3466 if (c == 0)
3467 return 0;
3468 /* level 1*/
3469 i = map->level1[l1];
3470 if (i == 0xFF) {
3471 return -1;
3472 }
3473 /* level 2*/
3474 i = map->level23[16*i+l2];
3475 if (i == 0xFF) {
3476 return -1;
3477 }
3478 /* level 3 */
3479 i = map->level23[16*map->count2 + 128*i + l3];
3480 if (i == 0) {
3481 return -1;
3482 }
3483 return i;
3484}
3485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486/* Lookup the character ch in the mapping. If the character
3487 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003488 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 PyObject *w = PyInt_FromLong((long)c);
3492 PyObject *x;
3493
3494 if (w == NULL)
3495 return NULL;
3496 x = PyObject_GetItem(mapping, w);
3497 Py_DECREF(w);
3498 if (x == NULL) {
3499 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3500 /* No mapping found means: mapping is undefined. */
3501 PyErr_Clear();
3502 x = Py_None;
3503 Py_INCREF(x);
3504 return x;
3505 } else
3506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003508 else if (x == Py_None)
3509 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 else if (PyInt_Check(x)) {
3511 long value = PyInt_AS_LONG(x);
3512 if (value < 0 || value > 255) {
3513 PyErr_SetString(PyExc_TypeError,
3514 "character mapping must be in range(256)");
3515 Py_DECREF(x);
3516 return NULL;
3517 }
3518 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 else if (PyString_Check(x))
3521 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 /* wrong return value */
3524 PyErr_SetString(PyExc_TypeError,
3525 "character mapping must return integer, None or str");
3526 Py_DECREF(x);
3527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528 }
3529}
3530
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003531static int
3532charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3533{
3534 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3535 /* exponentially overallocate to minimize reallocations */
3536 if (requiredsize < 2*outsize)
3537 requiredsize = 2*outsize;
3538 if (_PyString_Resize(outobj, requiredsize)) {
3539 return 0;
3540 }
3541 return 1;
3542}
3543
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was written to the output */
    enc_FAILED,     /* character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547/* lookup the character, put the result in the output string and adjust
3548 various state variables. Reallocate the output string if not enough
3549 space is available. Return a new reference to the object that
3550 was put in the output buffer, or Py_None, if the mapping was undefined
3551 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003552 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003554charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003555 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003557 PyObject *rep;
3558 char *outstart;
3559 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003561 if (mapping->ob_type == &EncodingMapType) {
3562 int res = encoding_map_lookup(c, mapping);
3563 Py_ssize_t requiredsize = *outpos+1;
3564 if (res == -1)
3565 return enc_FAILED;
3566 if (outsize<requiredsize)
3567 if (!charmapencode_resize(outobj, outpos, requiredsize))
3568 return enc_EXCEPTION;
3569 outstart = PyString_AS_STRING(*outobj);
3570 outstart[(*outpos)++] = (char)res;
3571 return enc_SUCCESS;
3572 }
3573
3574 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003576 return enc_EXCEPTION;
3577 else if (rep==Py_None) {
3578 Py_DECREF(rep);
3579 return enc_FAILED;
3580 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003582 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003583 if (outsize<requiredsize)
3584 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003586 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003588 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3590 }
3591 else {
3592 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003593 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3594 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003595 if (outsize<requiredsize)
3596 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003598 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003600 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 memcpy(outstart + *outpos, repchars, repsize);
3602 *outpos += repsize;
3603 }
3604 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003605 Py_DECREF(rep);
3606 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607}
3608
3609/* handle an error in PyUnicode_EncodeCharmap
3610 Return 0 on success, -1 on error */
3611static
3612int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003613 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003615 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003616 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617{
3618 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003619 Py_ssize_t repsize;
3620 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 Py_UNICODE *uni2;
3622 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003623 Py_ssize_t collstartpos = *inpos;
3624 Py_ssize_t collendpos = *inpos+1;
3625 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 char *encoding = "charmap";
3627 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003628 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630 /* find all unencodable characters */
3631 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003632 PyObject *rep;
3633 if (mapping->ob_type == &EncodingMapType) {
3634 int res = encoding_map_lookup(p[collendpos], mapping);
3635 if (res != -1)
3636 break;
3637 ++collendpos;
3638 continue;
3639 }
3640
3641 rep = charmapencode_lookup(p[collendpos], mapping);
3642 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003644 else if (rep!=Py_None) {
3645 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646 break;
3647 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003648 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 ++collendpos;
3650 }
3651 /* cache callback name lookup
3652 * (if not done yet, i.e. it's the first error) */
3653 if (*known_errorHandler==-1) {
3654 if ((errors==NULL) || (!strcmp(errors, "strict")))
3655 *known_errorHandler = 1;
3656 else if (!strcmp(errors, "replace"))
3657 *known_errorHandler = 2;
3658 else if (!strcmp(errors, "ignore"))
3659 *known_errorHandler = 3;
3660 else if (!strcmp(errors, "xmlcharrefreplace"))
3661 *known_errorHandler = 4;
3662 else
3663 *known_errorHandler = 0;
3664 }
3665 switch (*known_errorHandler) {
3666 case 1: /* strict */
3667 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3668 return -1;
3669 case 2: /* replace */
3670 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3671 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003672 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 return -1;
3674 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003675 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3677 return -1;
3678 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 }
3680 /* fall through */
3681 case 3: /* ignore */
3682 *inpos = collendpos;
3683 break;
3684 case 4: /* xmlcharrefreplace */
3685 /* generate replacement (temporarily (mis)uses p) */
3686 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3687 char buffer[2+29+1+1];
3688 char *cp;
3689 sprintf(buffer, "&#%d;", (int)p[collpos]);
3690 for (cp = buffer; *cp; ++cp) {
3691 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003692 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003694 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3696 return -1;
3697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 }
3699 }
3700 *inpos = collendpos;
3701 break;
3702 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003703 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003704 encoding, reason, p, size, exceptionObject,
3705 collstartpos, collendpos, &newpos);
3706 if (repunicode == NULL)
3707 return -1;
3708 /* generate replacement */
3709 repsize = PyUnicode_GET_SIZE(repunicode);
3710 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3711 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003712 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 return -1;
3714 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003715 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3718 return -1;
3719 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 }
3721 *inpos = newpos;
3722 Py_DECREF(repunicode);
3723 }
3724 return 0;
3725}
3726
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003728 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 PyObject *mapping,
3730 const char *errors)
3731{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 /* output object */
3733 PyObject *res = NULL;
3734 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003735 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003736 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003737 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003738 PyObject *errorHandler = NULL;
3739 PyObject *exc = NULL;
3740 /* the following variable is used for caching string comparisons
3741 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3742 * 3=ignore, 4=xmlcharrefreplace */
3743 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744
3745 /* Default to Latin-1 */
3746 if (mapping == NULL)
3747 return PyUnicode_EncodeLatin1(p, size, errors);
3748
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 /* allocate enough for a simple encoding without
3750 replacements, if we need more, we'll resize */
3751 res = PyString_FromStringAndSize(NULL, size);
3752 if (res == NULL)
3753 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003754 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 while (inpos<size) {
3758 /* try to encode it */
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003759 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3760 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003762 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 if (charmap_encoding_error(p, size, &inpos, mapping,
3764 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003765 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003766 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003767 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 else
3771 /* done with this character => adjust input position */
3772 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 /* Resize if we allocated to much */
3776 if (respos<PyString_GET_SIZE(res)) {
3777 if (_PyString_Resize(&res, respos))
3778 goto onError;
3779 }
3780 Py_XDECREF(exc);
3781 Py_XDECREF(errorHandler);
3782 return res;
3783
3784 onError:
3785 Py_XDECREF(res);
3786 Py_XDECREF(exc);
3787 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 return NULL;
3789}
3790
3791PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3792 PyObject *mapping)
3793{
3794 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3795 PyErr_BadArgument();
3796 return NULL;
3797 }
3798 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3799 PyUnicode_GET_SIZE(unicode),
3800 mapping,
3801 NULL);
3802}
3803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804/* create or adjust a UnicodeTranslateError */
3805static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003806 const Py_UNICODE *unicode, Py_ssize_t size,
3807 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 if (*exceptionObject == NULL) {
3811 *exceptionObject = PyUnicodeTranslateError_Create(
3812 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 }
3814 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3816 goto onError;
3817 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3818 goto onError;
3819 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3820 goto onError;
3821 return;
3822 onError:
3823 Py_DECREF(*exceptionObject);
3824 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 }
3826}
3827
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828/* raises a UnicodeTranslateError */
3829static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 const Py_UNICODE *unicode, Py_ssize_t size,
3831 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 const char *reason)
3833{
3834 make_translate_exception(exceptionObject,
3835 unicode, size, startpos, endpos, reason);
3836 if (*exceptionObject != NULL)
3837 PyCodec_StrictErrors(*exceptionObject);
3838}
3839
3840/* error handling callback helper:
3841 build arguments, call the callback and check the arguments,
3842 put the result into newpos and return the replacement string, which
3843 has to be freed by the caller */
3844static PyObject *unicode_translate_call_errorhandler(const char *errors,
3845 PyObject **errorHandler,
3846 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003847 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3848 Py_ssize_t startpos, Py_ssize_t endpos,
3849 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003851 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003853 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003854 PyObject *restuple;
3855 PyObject *resunicode;
3856
3857 if (*errorHandler == NULL) {
3858 *errorHandler = PyCodec_LookupError(errors);
3859 if (*errorHandler == NULL)
3860 return NULL;
3861 }
3862
3863 make_translate_exception(exceptionObject,
3864 unicode, size, startpos, endpos, reason);
3865 if (*exceptionObject == NULL)
3866 return NULL;
3867
3868 restuple = PyObject_CallFunctionObjArgs(
3869 *errorHandler, *exceptionObject, NULL);
3870 if (restuple == NULL)
3871 return NULL;
3872 if (!PyTuple_Check(restuple)) {
3873 PyErr_Format(PyExc_TypeError, &argparse[4]);
3874 Py_DECREF(restuple);
3875 return NULL;
3876 }
3877 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003878 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 Py_DECREF(restuple);
3880 return NULL;
3881 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003882 if (i_newpos<0)
3883 *newpos = size+i_newpos;
3884 else
3885 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003886 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003887 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003888 Py_DECREF(restuple);
3889 return NULL;
3890 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003891 Py_INCREF(resunicode);
3892 Py_DECREF(restuple);
3893 return resunicode;
3894}
3895
3896/* Lookup the character ch in the mapping and put the result in result,
3897 which must be decrefed by the caller.
3898 Return 0 on success, -1 on error */
3899static
3900int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3901{
3902 PyObject *w = PyInt_FromLong((long)c);
3903 PyObject *x;
3904
3905 if (w == NULL)
3906 return -1;
3907 x = PyObject_GetItem(mapping, w);
3908 Py_DECREF(w);
3909 if (x == NULL) {
3910 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3911 /* No mapping found means: use 1:1 mapping. */
3912 PyErr_Clear();
3913 *result = NULL;
3914 return 0;
3915 } else
3916 return -1;
3917 }
3918 else if (x == Py_None) {
3919 *result = x;
3920 return 0;
3921 }
3922 else if (PyInt_Check(x)) {
3923 long value = PyInt_AS_LONG(x);
3924 long max = PyUnicode_GetMax();
3925 if (value < 0 || value > max) {
3926 PyErr_Format(PyExc_TypeError,
3927 "character mapping must be in range(0x%lx)", max+1);
3928 Py_DECREF(x);
3929 return -1;
3930 }
3931 *result = x;
3932 return 0;
3933 }
3934 else if (PyUnicode_Check(x)) {
3935 *result = x;
3936 return 0;
3937 }
3938 else {
3939 /* wrong return value */
3940 PyErr_SetString(PyExc_TypeError,
3941 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003942 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 return -1;
3944 }
3945}
3946/* ensure that *outobj is at least requiredsize characters long,
3947if not reallocate and adjust various state variables.
3948Return 0 on success, -1 on error */
3949static
Walter Dörwald4894c302003-10-24 14:25:28 +00003950int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003951 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003953 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003954 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003956 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003958 if (requiredsize < 2 * oldsize)
3959 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003960 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 return -1;
3962 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 }
3964 return 0;
3965}
3966/* lookup the character, put the result in the output string and adjust
3967 various state variables. Return a new reference to the object that
3968 was put in the output buffer in *result, or Py_None, if the mapping was
3969 undefined (in which case no character was written).
3970 The called must decref result.
3971 Return 0 on success, -1 on error. */
3972static
Walter Dörwald4894c302003-10-24 14:25:28 +00003973int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003974 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003975 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976{
Walter Dörwald4894c302003-10-24 14:25:28 +00003977 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 return -1;
3979 if (*res==NULL) {
3980 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003981 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 }
3983 else if (*res==Py_None)
3984 ;
3985 else if (PyInt_Check(*res)) {
3986 /* no overflow check, because we know that the space is enough */
3987 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3988 }
3989 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003990 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 if (repsize==1) {
3992 /* no overflow check, because we know that the space is enough */
3993 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3994 }
3995 else if (repsize!=0) {
3996 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003997 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003998 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003999 repsize - 1;
4000 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 return -1;
4002 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4003 *outp += repsize;
4004 }
4005 }
4006 else
4007 return -1;
4008 return 0;
4009}
4010
4011PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004012 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013 PyObject *mapping,
4014 const char *errors)
4015{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016 /* output object */
4017 PyObject *res = NULL;
4018 /* pointers to the beginning and end+1 of input */
4019 const Py_UNICODE *startp = p;
4020 const Py_UNICODE *endp = p + size;
4021 /* pointer into the output */
4022 Py_UNICODE *str;
4023 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004024 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 char *reason = "character maps to <undefined>";
4026 PyObject *errorHandler = NULL;
4027 PyObject *exc = NULL;
4028 /* the following variable is used for caching string comparisons
4029 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4030 * 3=ignore, 4=xmlcharrefreplace */
4031 int known_errorHandler = -1;
4032
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 if (mapping == NULL) {
4034 PyErr_BadArgument();
4035 return NULL;
4036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037
4038 /* allocate enough for a simple 1:1 translation without
4039 replacements, if we need more, we'll resize */
4040 res = PyUnicode_FromUnicode(NULL, size);
4041 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004042 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044 return res;
4045 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 while (p<endp) {
4048 /* try to encode it */
4049 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004050 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 goto onError;
4053 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004054 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 if (x!=Py_None) /* it worked => adjust input pointer */
4056 ++p;
4057 else { /* untranslatable character */
4058 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t repsize;
4060 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 Py_UNICODE *uni2;
4062 /* startpos for collecting untranslatable chars */
4063 const Py_UNICODE *collstart = p;
4064 const Py_UNICODE *collend = p+1;
4065 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 /* find all untranslatable characters */
4068 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004069 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 goto onError;
4071 Py_XDECREF(x);
4072 if (x!=Py_None)
4073 break;
4074 ++collend;
4075 }
4076 /* cache callback name lookup
4077 * (if not done yet, i.e. it's the first error) */
4078 if (known_errorHandler==-1) {
4079 if ((errors==NULL) || (!strcmp(errors, "strict")))
4080 known_errorHandler = 1;
4081 else if (!strcmp(errors, "replace"))
4082 known_errorHandler = 2;
4083 else if (!strcmp(errors, "ignore"))
4084 known_errorHandler = 3;
4085 else if (!strcmp(errors, "xmlcharrefreplace"))
4086 known_errorHandler = 4;
4087 else
4088 known_errorHandler = 0;
4089 }
4090 switch (known_errorHandler) {
4091 case 1: /* strict */
4092 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4093 goto onError;
4094 case 2: /* replace */
4095 /* No need to check for space, this is a 1:1 replacement */
4096 for (coll = collstart; coll<collend; ++coll)
4097 *str++ = '?';
4098 /* fall through */
4099 case 3: /* ignore */
4100 p = collend;
4101 break;
4102 case 4: /* xmlcharrefreplace */
4103 /* generate replacement (temporarily (mis)uses p) */
4104 for (p = collstart; p < collend; ++p) {
4105 char buffer[2+29+1+1];
4106 char *cp;
4107 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004108 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4110 goto onError;
4111 for (cp = buffer; *cp; ++cp)
4112 *str++ = *cp;
4113 }
4114 p = collend;
4115 break;
4116 default:
4117 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4118 reason, startp, size, &exc,
4119 collstart-startp, collend-startp, &newpos);
4120 if (repunicode == NULL)
4121 goto onError;
4122 /* generate replacement */
4123 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004124 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4126 Py_DECREF(repunicode);
4127 goto onError;
4128 }
4129 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4130 *str++ = *uni2;
4131 p = startp + newpos;
4132 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 }
4134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 /* Resize if we allocated to much */
4137 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004138 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004139 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004140 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 }
4142 Py_XDECREF(exc);
4143 Py_XDECREF(errorHandler);
4144 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 onError:
4147 Py_XDECREF(res);
4148 Py_XDECREF(exc);
4149 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 return NULL;
4151}
4152
4153PyObject *PyUnicode_Translate(PyObject *str,
4154 PyObject *mapping,
4155 const char *errors)
4156{
4157 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004158
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 str = PyUnicode_FromObject(str);
4160 if (str == NULL)
4161 goto onError;
4162 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4163 PyUnicode_GET_SIZE(str),
4164 mapping,
4165 errors);
4166 Py_DECREF(str);
4167 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004168
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 onError:
4170 Py_XDECREF(str);
4171 return NULL;
4172}
Tim Petersced69f82003-09-16 20:30:58 +00004173
Guido van Rossum9e896b32000-04-05 20:11:21 +00004174/* --- Decimal Encoder ---------------------------------------------------- */
4175
4176int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004177 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004178 char *output,
4179 const char *errors)
4180{
4181 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 PyObject *errorHandler = NULL;
4183 PyObject *exc = NULL;
4184 const char *encoding = "decimal";
4185 const char *reason = "invalid decimal Unicode string";
4186 /* the following variable is used for caching string comparisons
4187 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4188 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004189
4190 if (output == NULL) {
4191 PyErr_BadArgument();
4192 return -1;
4193 }
4194
4195 p = s;
4196 end = s + length;
4197 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004199 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004201 Py_ssize_t repsize;
4202 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 Py_UNICODE *uni2;
4204 Py_UNICODE *collstart;
4205 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004206
Guido van Rossum9e896b32000-04-05 20:11:21 +00004207 if (Py_UNICODE_ISSPACE(ch)) {
4208 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004210 continue;
4211 }
4212 decimal = Py_UNICODE_TODECIMAL(ch);
4213 if (decimal >= 0) {
4214 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004216 continue;
4217 }
Guido van Rossumba477042000-04-06 18:18:10 +00004218 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004219 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004221 continue;
4222 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 /* All other characters are considered unencodable */
4224 collstart = p;
4225 collend = p+1;
4226 while (collend < end) {
4227 if ((0 < *collend && *collend < 256) ||
4228 !Py_UNICODE_ISSPACE(*collend) ||
4229 Py_UNICODE_TODECIMAL(*collend))
4230 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004231 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 /* cache callback name lookup
4233 * (if not done yet, i.e. it's the first error) */
4234 if (known_errorHandler==-1) {
4235 if ((errors==NULL) || (!strcmp(errors, "strict")))
4236 known_errorHandler = 1;
4237 else if (!strcmp(errors, "replace"))
4238 known_errorHandler = 2;
4239 else if (!strcmp(errors, "ignore"))
4240 known_errorHandler = 3;
4241 else if (!strcmp(errors, "xmlcharrefreplace"))
4242 known_errorHandler = 4;
4243 else
4244 known_errorHandler = 0;
4245 }
4246 switch (known_errorHandler) {
4247 case 1: /* strict */
4248 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4249 goto onError;
4250 case 2: /* replace */
4251 for (p = collstart; p < collend; ++p)
4252 *output++ = '?';
4253 /* fall through */
4254 case 3: /* ignore */
4255 p = collend;
4256 break;
4257 case 4: /* xmlcharrefreplace */
4258 /* generate replacement (temporarily (mis)uses p) */
4259 for (p = collstart; p < collend; ++p)
4260 output += sprintf(output, "&#%d;", (int)*p);
4261 p = collend;
4262 break;
4263 default:
4264 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4265 encoding, reason, s, length, &exc,
4266 collstart-s, collend-s, &newpos);
4267 if (repunicode == NULL)
4268 goto onError;
4269 /* generate replacement */
4270 repsize = PyUnicode_GET_SIZE(repunicode);
4271 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4272 Py_UNICODE ch = *uni2;
4273 if (Py_UNICODE_ISSPACE(ch))
4274 *output++ = ' ';
4275 else {
4276 decimal = Py_UNICODE_TODECIMAL(ch);
4277 if (decimal >= 0)
4278 *output++ = '0' + decimal;
4279 else if (0 < ch && ch < 256)
4280 *output++ = (char)ch;
4281 else {
4282 Py_DECREF(repunicode);
4283 raise_encode_exception(&exc, encoding,
4284 s, length, collstart-s, collend-s, reason);
4285 goto onError;
4286 }
4287 }
4288 }
4289 p = s + newpos;
4290 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004291 }
4292 }
4293 /* 0-terminate the output string */
4294 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(exc);
4296 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004297 return 0;
4298
4299 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 Py_XDECREF(exc);
4301 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004302 return -1;
4303}
4304
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305/* --- Helpers ------------------------------------------------------------ */
4306
/* Configuration macros mapping generic STRINGLIB_* names onto the unicode
   type: character type, length accessor, constructor and buffer accessor.
   NOTE(review): presumably consumed by the stringlib headers included
   further down -- confirm against those headers. */
#define STRINGLIB_CHAR Py_UNICODE

#define STRINGLIB_LEN PyUnicode_GET_SIZE
#define STRINGLIB_NEW PyUnicode_FromUnicode
#define STRINGLIB_STR PyUnicode_AS_UNICODE
4312
4313Py_LOCAL_INLINE(int)
4314STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004316 if (str[0] != other[0])
4317 return 1;
4318 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319}
4320
/* Object bound to the generic STRINGLIB_EMPTY name.
   NOTE(review): presumably the shared empty unicode string used by the
   stringlib headers included below -- confirm. */
#define STRINGLIB_EMPTY unicode_empty

/* stringlib headers; they are included after the STRINGLIB_* definitions
   above */
#include "stringlib/fastsearch.h"

#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
4328
/* Helper macro to fix up start/end slice values in place.

   Operates on the variables named `start` and `end` in the calling scope
   and on (obj)->length: a negative start or end is taken relative to the
   length and clamped at 0; an end beyond the length is cut back to the
   length.  (start is not clamped to the length.)

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and can be used safely in unbraced if/else bodies.
   Invoke it with a trailing semicolon: FIX_START_END(obj); */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4341
Martin v. Löwis18e16552006-02-15 17:27:45 +00004342Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004343 PyObject *substr,
4344 Py_ssize_t start,
4345 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004347 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004348 PyUnicodeObject* str_obj;
4349 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004350
Thomas Wouters477c8d52006-05-27 19:21:47 +00004351 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4352 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004354 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4355 if (!sub_obj) {
4356 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 return -1;
4358 }
Tim Petersced69f82003-09-16 20:30:58 +00004359
Thomas Wouters477c8d52006-05-27 19:21:47 +00004360 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004361
Thomas Wouters477c8d52006-05-27 19:21:47 +00004362 result = stringlib_count(
4363 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4364 );
4365
4366 Py_DECREF(sub_obj);
4367 Py_DECREF(str_obj);
4368
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 return result;
4370}
4371
Martin v. Löwis18e16552006-02-15 17:27:45 +00004372Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004373 PyObject *sub,
4374 Py_ssize_t start,
4375 Py_ssize_t end,
4376 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004378 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004379
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004381 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004382 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004383 sub = PyUnicode_FromObject(sub);
4384 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004385 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004386 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 }
Tim Petersced69f82003-09-16 20:30:58 +00004388
Thomas Wouters477c8d52006-05-27 19:21:47 +00004389 if (direction > 0)
4390 result = stringlib_find_slice(
4391 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4392 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4393 start, end
4394 );
4395 else
4396 result = stringlib_rfind_slice(
4397 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4398 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4399 start, end
4400 );
4401
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004403 Py_DECREF(sub);
4404
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 return result;
4406}
4407
Tim Petersced69f82003-09-16 20:30:58 +00004408static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409int tailmatch(PyUnicodeObject *self,
4410 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004411 Py_ssize_t start,
4412 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 int direction)
4414{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 if (substring->length == 0)
4416 return 1;
4417
Thomas Wouters477c8d52006-05-27 19:21:47 +00004418 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419
4420 end -= substring->length;
4421 if (end < start)
4422 return 0;
4423
4424 if (direction > 0) {
4425 if (Py_UNICODE_MATCH(self, end, substring))
4426 return 1;
4427 } else {
4428 if (Py_UNICODE_MATCH(self, start, substring))
4429 return 1;
4430 }
4431
4432 return 0;
4433}
4434
Martin v. Löwis18e16552006-02-15 17:27:45 +00004435Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004437 Py_ssize_t start,
4438 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 int direction)
4440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004441 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004442
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 str = PyUnicode_FromObject(str);
4444 if (str == NULL)
4445 return -1;
4446 substr = PyUnicode_FromObject(substr);
4447 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004448 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 return -1;
4450 }
Tim Petersced69f82003-09-16 20:30:58 +00004451
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 result = tailmatch((PyUnicodeObject *)str,
4453 (PyUnicodeObject *)substr,
4454 start, end, direction);
4455 Py_DECREF(str);
4456 Py_DECREF(substr);
4457 return result;
4458}
4459
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460/* Apply fixfct filter to the Unicode object self and return a
4461 reference to the modified object */
4462
Tim Petersced69f82003-09-16 20:30:58 +00004463static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464PyObject *fixup(PyUnicodeObject *self,
4465 int (*fixfct)(PyUnicodeObject *s))
4466{
4467
4468 PyUnicodeObject *u;
4469
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004470 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 if (u == NULL)
4472 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004473
4474 Py_UNICODE_COPY(u->str, self->str, self->length);
4475
Tim Peters7a29bd52001-09-12 03:03:31 +00004476 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 /* fixfct should return TRUE if it modified the buffer. If
4478 FALSE, return a reference to the original buffer instead
4479 (to save space, not time) */
4480 Py_INCREF(self);
4481 Py_DECREF(u);
4482 return (PyObject*) self;
4483 }
4484 return (PyObject*) u;
4485}
4486
Tim Petersced69f82003-09-16 20:30:58 +00004487static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488int fixupper(PyUnicodeObject *self)
4489{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004490 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 Py_UNICODE *s = self->str;
4492 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004493
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 while (len-- > 0) {
4495 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004496
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 ch = Py_UNICODE_TOUPPER(*s);
4498 if (ch != *s) {
4499 status = 1;
4500 *s = ch;
4501 }
4502 s++;
4503 }
4504
4505 return status;
4506}
4507
Tim Petersced69f82003-09-16 20:30:58 +00004508static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509int fixlower(PyUnicodeObject *self)
4510{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004511 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512 Py_UNICODE *s = self->str;
4513 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004514
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 while (len-- > 0) {
4516 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004517
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 ch = Py_UNICODE_TOLOWER(*s);
4519 if (ch != *s) {
4520 status = 1;
4521 *s = ch;
4522 }
4523 s++;
4524 }
4525
4526 return status;
4527}
4528
Tim Petersced69f82003-09-16 20:30:58 +00004529static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530int fixswapcase(PyUnicodeObject *self)
4531{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004532 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 Py_UNICODE *s = self->str;
4534 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004535
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 while (len-- > 0) {
4537 if (Py_UNICODE_ISUPPER(*s)) {
4538 *s = Py_UNICODE_TOLOWER(*s);
4539 status = 1;
4540 } else if (Py_UNICODE_ISLOWER(*s)) {
4541 *s = Py_UNICODE_TOUPPER(*s);
4542 status = 1;
4543 }
4544 s++;
4545 }
4546
4547 return status;
4548}
4549
Tim Petersced69f82003-09-16 20:30:58 +00004550static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551int fixcapitalize(PyUnicodeObject *self)
4552{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004553 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004554 Py_UNICODE *s = self->str;
4555 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004556
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004557 if (len == 0)
4558 return 0;
4559 if (Py_UNICODE_ISLOWER(*s)) {
4560 *s = Py_UNICODE_TOUPPER(*s);
4561 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004563 s++;
4564 while (--len > 0) {
4565 if (Py_UNICODE_ISUPPER(*s)) {
4566 *s = Py_UNICODE_TOLOWER(*s);
4567 status = 1;
4568 }
4569 s++;
4570 }
4571 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572}
4573
4574static
4575int fixtitle(PyUnicodeObject *self)
4576{
4577 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4578 register Py_UNICODE *e;
4579 int previous_is_cased;
4580
4581 /* Shortcut for single character strings */
4582 if (PyUnicode_GET_SIZE(self) == 1) {
4583 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4584 if (*p != ch) {
4585 *p = ch;
4586 return 1;
4587 }
4588 else
4589 return 0;
4590 }
Tim Petersced69f82003-09-16 20:30:58 +00004591
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 e = p + PyUnicode_GET_SIZE(self);
4593 previous_is_cased = 0;
4594 for (; p < e; p++) {
4595 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004596
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 if (previous_is_cased)
4598 *p = Py_UNICODE_TOLOWER(ch);
4599 else
4600 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004601
4602 if (Py_UNICODE_ISLOWER(ch) ||
4603 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 Py_UNICODE_ISTITLE(ch))
4605 previous_is_cased = 1;
4606 else
4607 previous_is_cased = 0;
4608 }
4609 return 1;
4610}
4611
Tim Peters8ce9f162004-08-27 01:49:32 +00004612PyObject *
4613PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614{
Tim Peters8ce9f162004-08-27 01:49:32 +00004615 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004616 const Py_UNICODE blank = ' ';
4617 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004618 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004619 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004620 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4621 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004622 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4623 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004624 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004625 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004626 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627
Tim Peters05eba1f2004-08-27 21:32:02 +00004628 fseq = PySequence_Fast(seq, "");
4629 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004630 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004631 }
4632
Tim Peters91879ab2004-08-27 22:35:44 +00004633 /* Grrrr. A codec may be invoked to convert str objects to
4634 * Unicode, and so it's possible to call back into Python code
4635 * during PyUnicode_FromObject(), and so it's possible for a sick
4636 * codec to change the size of fseq (if seq is a list). Therefore
4637 * we have to keep refetching the size -- can't assume seqlen
4638 * is invariant.
4639 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004640 seqlen = PySequence_Fast_GET_SIZE(fseq);
4641 /* If empty sequence, return u"". */
4642 if (seqlen == 0) {
4643 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4644 goto Done;
4645 }
4646 /* If singleton sequence with an exact Unicode, return that. */
4647 if (seqlen == 1) {
4648 item = PySequence_Fast_GET_ITEM(fseq, 0);
4649 if (PyUnicode_CheckExact(item)) {
4650 Py_INCREF(item);
4651 res = (PyUnicodeObject *)item;
4652 goto Done;
4653 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004654 }
4655
Tim Peters05eba1f2004-08-27 21:32:02 +00004656 /* At least two items to join, or one that isn't exact Unicode. */
4657 if (seqlen > 1) {
4658 /* Set up sep and seplen -- they're needed. */
4659 if (separator == NULL) {
4660 sep = &blank;
4661 seplen = 1;
4662 }
4663 else {
4664 internal_separator = PyUnicode_FromObject(separator);
4665 if (internal_separator == NULL)
4666 goto onError;
4667 sep = PyUnicode_AS_UNICODE(internal_separator);
4668 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004669 /* In case PyUnicode_FromObject() mutated seq. */
4670 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004671 }
4672 }
4673
4674 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004675 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004676 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004677 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004678 res_p = PyUnicode_AS_UNICODE(res);
4679 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004680
Tim Peters05eba1f2004-08-27 21:32:02 +00004681 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004682 Py_ssize_t itemlen;
4683 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004684
4685 item = PySequence_Fast_GET_ITEM(fseq, i);
4686 /* Convert item to Unicode. */
4687 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4688 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004689 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004690 " %.80s found",
4691 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004692 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004693 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004694 item = PyUnicode_FromObject(item);
4695 if (item == NULL)
4696 goto onError;
4697 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004698
Tim Peters91879ab2004-08-27 22:35:44 +00004699 /* In case PyUnicode_FromObject() mutated seq. */
4700 seqlen = PySequence_Fast_GET_SIZE(fseq);
4701
Tim Peters8ce9f162004-08-27 01:49:32 +00004702 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004704 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004705 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004706 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004707 if (i < seqlen - 1) {
4708 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004709 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004710 goto Overflow;
4711 }
4712 if (new_res_used > res_alloc) {
4713 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004714 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004715 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004716 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004717 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004718 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004719 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004720 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004722 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004723 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004725
4726 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004727 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004728 res_p += itemlen;
4729 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004730 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004731 res_p += seplen;
4732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004734 res_used = new_res_used;
4735 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004736
Tim Peters05eba1f2004-08-27 21:32:02 +00004737 /* Shrink res to match the used area; this probably can't fail,
4738 * but it's cheap to check.
4739 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004740 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004741 goto onError;
4742
4743 Done:
4744 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004745 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 return (PyObject *)res;
4747
Tim Peters8ce9f162004-08-27 01:49:32 +00004748 Overflow:
4749 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004750 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004751 Py_DECREF(item);
4752 /* fall through */
4753
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004755 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004756 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004757 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 return NULL;
4759}
4760
Tim Petersced69f82003-09-16 20:30:58 +00004761static
4762PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004763 Py_ssize_t left,
4764 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 Py_UNICODE fill)
4766{
4767 PyUnicodeObject *u;
4768
4769 if (left < 0)
4770 left = 0;
4771 if (right < 0)
4772 right = 0;
4773
Tim Peters7a29bd52001-09-12 03:03:31 +00004774 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 Py_INCREF(self);
4776 return self;
4777 }
4778
4779 u = _PyUnicode_New(left + self->length + right);
4780 if (u) {
4781 if (left)
4782 Py_UNICODE_FILL(u->str, fill, left);
4783 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4784 if (right)
4785 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4786 }
4787
4788 return u;
4789}
4790
/* Append the slice data[left:right] to `list` as a new unicode object.
   The expansion relies on the enclosing function providing a `PyObject *str`
   scratch variable, a `list` variable and an `onError` label, which is the
   jump target on failure.  Wrapped in do { } while (0) so that the macro
   expands to exactly one statement and is safe inside unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4801
4802static
4803PyObject *split_whitespace(PyUnicodeObject *self,
4804 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004805 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004807 register Py_ssize_t i;
4808 register Py_ssize_t j;
4809 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 PyObject *str;
4811
4812 for (i = j = 0; i < len; ) {
4813 /* find a token */
4814 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4815 i++;
4816 j = i;
4817 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4818 i++;
4819 if (j < i) {
4820 if (maxcount-- <= 0)
4821 break;
4822 SPLIT_APPEND(self->str, j, i);
4823 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4824 i++;
4825 j = i;
4826 }
4827 }
4828 if (j < len) {
4829 SPLIT_APPEND(self->str, j, len);
4830 }
4831 return list;
4832
4833 onError:
4834 Py_DECREF(list);
4835 return NULL;
4836}
4837
4838PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004839 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004841 register Py_ssize_t i;
4842 register Py_ssize_t j;
4843 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 PyObject *list;
4845 PyObject *str;
4846 Py_UNICODE *data;
4847
4848 string = PyUnicode_FromObject(string);
4849 if (string == NULL)
4850 return NULL;
4851 data = PyUnicode_AS_UNICODE(string);
4852 len = PyUnicode_GET_SIZE(string);
4853
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 list = PyList_New(0);
4855 if (!list)
4856 goto onError;
4857
4858 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004859 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004860
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004862 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864
4865 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004866 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 if (i < len) {
4868 if (data[i] == '\r' && i + 1 < len &&
4869 data[i+1] == '\n')
4870 i += 2;
4871 else
4872 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004873 if (keepends)
4874 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 }
Guido van Rossum86662912000-04-11 15:38:46 +00004876 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 j = i;
4878 }
4879 if (j < len) {
4880 SPLIT_APPEND(data, j, len);
4881 }
4882
4883 Py_DECREF(string);
4884 return list;
4885
4886 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004887 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 Py_DECREF(string);
4889 return NULL;
4890}
4891
Tim Petersced69f82003-09-16 20:30:58 +00004892static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893PyObject *split_char(PyUnicodeObject *self,
4894 PyObject *list,
4895 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004898 register Py_ssize_t i;
4899 register Py_ssize_t j;
4900 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 PyObject *str;
4902
4903 for (i = j = 0; i < len; ) {
4904 if (self->str[i] == ch) {
4905 if (maxcount-- <= 0)
4906 break;
4907 SPLIT_APPEND(self->str, j, i);
4908 i = j = i + 1;
4909 } else
4910 i++;
4911 }
4912 if (j <= len) {
4913 SPLIT_APPEND(self->str, j, len);
4914 }
4915 return list;
4916
4917 onError:
4918 Py_DECREF(list);
4919 return NULL;
4920}
4921
Tim Petersced69f82003-09-16 20:30:58 +00004922static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923PyObject *split_substring(PyUnicodeObject *self,
4924 PyObject *list,
4925 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004926 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004928 register Py_ssize_t i;
4929 register Py_ssize_t j;
4930 Py_ssize_t len = self->length;
4931 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 PyObject *str;
4933
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004934 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 if (Py_UNICODE_MATCH(self, i, substring)) {
4936 if (maxcount-- <= 0)
4937 break;
4938 SPLIT_APPEND(self->str, j, i);
4939 i = j = i + sublen;
4940 } else
4941 i++;
4942 }
4943 if (j <= len) {
4944 SPLIT_APPEND(self->str, j, len);
4945 }
4946 return list;
4947
4948 onError:
4949 Py_DECREF(list);
4950 return NULL;
4951}
4952
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004953static
4954PyObject *rsplit_whitespace(PyUnicodeObject *self,
4955 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004956 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004957{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004958 register Py_ssize_t i;
4959 register Py_ssize_t j;
4960 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004961 PyObject *str;
4962
4963 for (i = j = len - 1; i >= 0; ) {
4964 /* find a token */
4965 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4966 i--;
4967 j = i;
4968 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4969 i--;
4970 if (j > i) {
4971 if (maxcount-- <= 0)
4972 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004973 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004974 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4975 i--;
4976 j = i;
4977 }
4978 }
4979 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004980 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004981 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004982 if (PyList_Reverse(list) < 0)
4983 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004984 return list;
4985
4986 onError:
4987 Py_DECREF(list);
4988 return NULL;
4989}
4990
4991static
4992PyObject *rsplit_char(PyUnicodeObject *self,
4993 PyObject *list,
4994 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004995 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004997 register Py_ssize_t i;
4998 register Py_ssize_t j;
4999 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005000 PyObject *str;
5001
5002 for (i = j = len - 1; i >= 0; ) {
5003 if (self->str[i] == ch) {
5004 if (maxcount-- <= 0)
5005 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005006 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005007 j = i = i - 1;
5008 } else
5009 i--;
5010 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005011 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005012 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005013 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005014 if (PyList_Reverse(list) < 0)
5015 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005016 return list;
5017
5018 onError:
5019 Py_DECREF(list);
5020 return NULL;
5021}
5022
5023static
5024PyObject *rsplit_substring(PyUnicodeObject *self,
5025 PyObject *list,
5026 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005027 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005028{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005029 register Py_ssize_t i;
5030 register Py_ssize_t j;
5031 Py_ssize_t len = self->length;
5032 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005033 PyObject *str;
5034
5035 for (i = len - sublen, j = len; i >= 0; ) {
5036 if (Py_UNICODE_MATCH(self, i, substring)) {
5037 if (maxcount-- <= 0)
5038 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005039 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005040 j = i;
5041 i -= sublen;
5042 } else
5043 i--;
5044 }
5045 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005046 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005047 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005048 if (PyList_Reverse(list) < 0)
5049 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005050 return list;
5051
5052 onError:
5053 Py_DECREF(list);
5054 return NULL;
5055}
5056
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057#undef SPLIT_APPEND
5058
5059static
5060PyObject *split(PyUnicodeObject *self,
5061 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005062 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063{
5064 PyObject *list;
5065
5066 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005067 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068
5069 list = PyList_New(0);
5070 if (!list)
5071 return NULL;
5072
5073 if (substring == NULL)
5074 return split_whitespace(self,list,maxcount);
5075
5076 else if (substring->length == 1)
5077 return split_char(self,list,substring->str[0],maxcount);
5078
5079 else if (substring->length == 0) {
5080 Py_DECREF(list);
5081 PyErr_SetString(PyExc_ValueError, "empty separator");
5082 return NULL;
5083 }
5084 else
5085 return split_substring(self,list,substring,maxcount);
5086}
5087
Tim Petersced69f82003-09-16 20:30:58 +00005088static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005089PyObject *rsplit(PyUnicodeObject *self,
5090 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005091 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005092{
5093 PyObject *list;
5094
5095 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005096 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005097
5098 list = PyList_New(0);
5099 if (!list)
5100 return NULL;
5101
5102 if (substring == NULL)
5103 return rsplit_whitespace(self,list,maxcount);
5104
5105 else if (substring->length == 1)
5106 return rsplit_char(self,list,substring->str[0],maxcount);
5107
5108 else if (substring->length == 0) {
5109 Py_DECREF(list);
5110 PyErr_SetString(PyExc_ValueError, "empty separator");
5111 return NULL;
5112 }
5113 else
5114 return rsplit_substring(self,list,substring,maxcount);
5115}
5116
5117static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118PyObject *replace(PyUnicodeObject *self,
5119 PyUnicodeObject *str1,
5120 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005121 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122{
5123 PyUnicodeObject *u;
5124
5125 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005126 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127
Thomas Wouters477c8d52006-05-27 19:21:47 +00005128 if (str1->length == str2->length) {
5129 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005130 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005131 if (str1->length == 1) {
5132 /* replace characters */
5133 Py_UNICODE u1, u2;
5134 if (!findchar(self->str, self->length, str1->str[0]))
5135 goto nothing;
5136 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5137 if (!u)
5138 return NULL;
5139 Py_UNICODE_COPY(u->str, self->str, self->length);
5140 u1 = str1->str[0];
5141 u2 = str2->str[0];
5142 for (i = 0; i < u->length; i++)
5143 if (u->str[i] == u1) {
5144 if (--maxcount < 0)
5145 break;
5146 u->str[i] = u2;
5147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005149 i = fastsearch(
5150 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005152 if (i < 0)
5153 goto nothing;
5154 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5155 if (!u)
5156 return NULL;
5157 Py_UNICODE_COPY(u->str, self->str, self->length);
5158 while (i <= self->length - str1->length)
5159 if (Py_UNICODE_MATCH(self, i, str1)) {
5160 if (--maxcount < 0)
5161 break;
5162 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5163 i += str1->length;
5164 } else
5165 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005168
5169 Py_ssize_t n, i, j, e;
5170 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 Py_UNICODE *p;
5172
5173 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005174 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 if (n > maxcount)
5176 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005177 if (n == 0)
5178 goto nothing;
5179 /* new_size = self->length + n * (str2->length - str1->length)); */
5180 delta = (str2->length - str1->length);
5181 if (delta == 0) {
5182 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005184 product = n * (str2->length - str1->length);
5185 if ((product / (str2->length - str1->length)) != n) {
5186 PyErr_SetString(PyExc_OverflowError,
5187 "replace string is too long");
5188 return NULL;
5189 }
5190 new_size = self->length + product;
5191 if (new_size < 0) {
5192 PyErr_SetString(PyExc_OverflowError,
5193 "replace string is too long");
5194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 }
5196 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005197 u = _PyUnicode_New(new_size);
5198 if (!u)
5199 return NULL;
5200 i = 0;
5201 p = u->str;
5202 e = self->length - str1->length;
5203 if (str1->length > 0) {
5204 while (n-- > 0) {
5205 /* look for next match */
5206 j = i;
5207 while (j <= e) {
5208 if (Py_UNICODE_MATCH(self, j, str1))
5209 break;
5210 j++;
5211 }
5212 if (j > i) {
5213 if (j > e)
5214 break;
5215 /* copy unchanged part [i:j] */
5216 Py_UNICODE_COPY(p, self->str+i, j-i);
5217 p += j - i;
5218 }
5219 /* copy substitution string */
5220 if (str2->length > 0) {
5221 Py_UNICODE_COPY(p, str2->str, str2->length);
5222 p += str2->length;
5223 }
5224 i = j + str1->length;
5225 }
5226 if (i < self->length)
5227 /* copy tail [i:] */
5228 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5229 } else {
5230 /* interleave */
5231 while (n > 0) {
5232 Py_UNICODE_COPY(p, str2->str, str2->length);
5233 p += str2->length;
5234 if (--n <= 0)
5235 break;
5236 *p++ = self->str[i++];
5237 }
5238 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005242
5243nothing:
5244 /* nothing to replace; return original string (when possible) */
5245 if (PyUnicode_CheckExact(self)) {
5246 Py_INCREF(self);
5247 return (PyObject *) self;
5248 }
5249 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250}
5251
5252/* --- Unicode Object Methods --------------------------------------------- */
5253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005254PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255"S.title() -> unicode\n\
5256\n\
5257Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005258characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259
5260static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005261unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 return fixup(self, fixtitle);
5264}
5265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005266PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267"S.capitalize() -> unicode\n\
5268\n\
5269Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005270have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271
5272static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005273unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 return fixup(self, fixcapitalize);
5276}
5277
#if 0
/* Disabled: S.capwords() is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5315
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005316/* Argument converter. Coerces to a single unicode character */
5317
5318static int
5319convert_uc(PyObject *obj, void *addr)
5320{
5321 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5322 PyObject *uniobj;
5323 Py_UNICODE *unistr;
5324
5325 uniobj = PyUnicode_FromObject(obj);
5326 if (uniobj == NULL) {
5327 PyErr_SetString(PyExc_TypeError,
5328 "The fill character cannot be converted to Unicode");
5329 return 0;
5330 }
5331 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5332 PyErr_SetString(PyExc_TypeError,
5333 "The fill character must be exactly one character long");
5334 Py_DECREF(uniobj);
5335 return 0;
5336 }
5337 unistr = PyUnicode_AS_UNICODE(uniobj);
5338 *fillcharloc = unistr[0];
5339 Py_DECREF(uniobj);
5340 return 1;
5341}
5342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005343PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005344"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005346Return S centered in a Unicode string of length width. Padding is\n\
5347done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348
5349static PyObject *
5350unicode_center(PyUnicodeObject *self, PyObject *args)
5351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t marg, left;
5353 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005354 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355
Thomas Woutersde017742006-02-16 19:34:37 +00005356 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 return NULL;
5358
Tim Peters7a29bd52001-09-12 03:03:31 +00005359 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 Py_INCREF(self);
5361 return (PyObject*) self;
5362 }
5363
5364 marg = width - self->length;
5365 left = marg / 2 + (marg & width & 1);
5366
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005367 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368}
5369
Marc-André Lemburge5034372000-08-08 08:04:29 +00005370#if 0
5371
5372/* This code should go into some future Unicode collation support
5373 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005374 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005375
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005376/* speedy UTF-16 code point order comparison */
5377/* gleaned from: */
5378/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5379
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005380static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005381{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005382 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005383 0, 0, 0, 0, 0, 0, 0, 0,
5384 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005385 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005386};
5387
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388static int
5389unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5390{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005391 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 Py_UNICODE *s1 = str1->str;
5394 Py_UNICODE *s2 = str2->str;
5395
5396 len1 = str1->length;
5397 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005400 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005401
5402 c1 = *s1++;
5403 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005404
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005405 if (c1 > (1<<11) * 26)
5406 c1 += utf16Fixup[c1>>11];
5407 if (c2 > (1<<11) * 26)
5408 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005409 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005410
5411 if (c1 != c2)
5412 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005413
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005414 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 }
5416
5417 return (len1 < len2) ? -1 : (len1 != len2);
5418}
5419
Marc-André Lemburge5034372000-08-08 08:04:29 +00005420#else
5421
5422static int
5423unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005425 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005426
5427 Py_UNICODE *s1 = str1->str;
5428 Py_UNICODE *s2 = str2->str;
5429
5430 len1 = str1->length;
5431 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005432
Marc-André Lemburge5034372000-08-08 08:04:29 +00005433 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005434 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005435
Fredrik Lundh45714e92001-06-26 16:39:36 +00005436 c1 = *s1++;
5437 c2 = *s2++;
5438
5439 if (c1 != c2)
5440 return (c1 < c2) ? -1 : 1;
5441
Marc-André Lemburge5034372000-08-08 08:04:29 +00005442 len1--; len2--;
5443 }
5444
5445 return (len1 < len2) ? -1 : (len1 != len2);
5446}
5447
5448#endif
5449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450int PyUnicode_Compare(PyObject *left,
5451 PyObject *right)
5452{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005453 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5454 return unicode_compare((PyUnicodeObject *)left,
5455 (PyUnicodeObject *)right);
5456 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5457 (PyUnicode_Check(left) && PyString_Check(right))) {
5458 if (PyUnicode_Check(left))
5459 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5460 if (PyUnicode_Check(right))
5461 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5462 assert(PyString_Check(left));
5463 assert(PyString_Check(right));
5464 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005466 PyErr_Format(PyExc_TypeError,
5467 "Can't compare %.100s and %.100s",
5468 left->ob_type->tp_name,
5469 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 return -1;
5471}
5472
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005473PyObject *PyUnicode_RichCompare(PyObject *left,
5474 PyObject *right,
5475 int op)
5476{
5477 int result;
5478
5479 result = PyUnicode_Compare(left, right);
5480 if (result == -1 && PyErr_Occurred())
5481 goto onError;
5482
5483 /* Convert the return value to a Boolean */
5484 switch (op) {
5485 case Py_EQ:
5486 result = (result == 0);
5487 break;
5488 case Py_NE:
5489 result = (result != 0);
5490 break;
5491 case Py_LE:
5492 result = (result <= 0);
5493 break;
5494 case Py_GE:
5495 result = (result >= 0);
5496 break;
5497 case Py_LT:
5498 result = (result == -1);
5499 break;
5500 case Py_GT:
5501 result = (result == 1);
5502 break;
5503 }
5504 return PyBool_FromLong(result);
5505
5506 onError:
5507
5508 /* Standard case
5509
5510 Type errors mean that PyUnicode_FromObject() could not convert
5511 one of the arguments (usually the right hand side) to Unicode,
5512 ie. we can't handle the comparison request. However, it is
5513 possible that the other object knows a comparison method, which
5514 is why we return Py_NotImplemented to give the other object a
5515 chance.
5516
5517 */
5518 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5519 PyErr_Clear();
5520 Py_INCREF(Py_NotImplemented);
5521 return Py_NotImplemented;
5522 }
5523 if (op != Py_EQ && op != Py_NE)
5524 return NULL;
5525
5526 /* Equality comparison.
5527
5528 This is a special case: we silence any PyExc_UnicodeDecodeError
5529 and instead turn it into a PyErr_UnicodeWarning.
5530
5531 */
5532 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5533 return NULL;
5534 PyErr_Clear();
5535 if (PyErr_Warn(PyExc_UnicodeWarning,
5536 (op == Py_EQ) ?
5537 "Unicode equal comparison "
5538 "failed to convert both arguments to Unicode - "
5539 "interpreting them as being unequal" :
5540 "Unicode unequal comparison "
5541 "failed to convert both arguments to Unicode - "
5542 "interpreting them as being unequal"
5543 ) < 0)
5544 return NULL;
5545 result = (op == Py_NE);
5546 return PyBool_FromLong(result);
5547}
5548
Guido van Rossum403d68b2000-03-13 15:55:09 +00005549int PyUnicode_Contains(PyObject *container,
5550 PyObject *element)
5551{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005552 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005553 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005554
5555 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005556 sub = PyUnicode_FromObject(element);
5557 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005558 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005559 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005560 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005561 }
5562
Thomas Wouters477c8d52006-05-27 19:21:47 +00005563 str = PyUnicode_FromObject(container);
5564 if (!str) {
5565 Py_DECREF(sub);
5566 return -1;
5567 }
5568
5569 result = stringlib_contains_obj(str, sub);
5570
5571 Py_DECREF(str);
5572 Py_DECREF(sub);
5573
Guido van Rossum403d68b2000-03-13 15:55:09 +00005574 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005575}
5576
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577/* Concat to string or Unicode object giving a new Unicode object. */
5578
5579PyObject *PyUnicode_Concat(PyObject *left,
5580 PyObject *right)
5581{
5582 PyUnicodeObject *u = NULL, *v = NULL, *w;
5583
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005584 if (PyBytes_Check(left) || PyBytes_Check(right))
5585 return PyBytes_Concat(left, right);
5586
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 /* Coerce the two arguments */
5588 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5589 if (u == NULL)
5590 goto onError;
5591 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5592 if (v == NULL)
5593 goto onError;
5594
5595 /* Shortcuts */
5596 if (v == unicode_empty) {
5597 Py_DECREF(v);
5598 return (PyObject *)u;
5599 }
5600 if (u == unicode_empty) {
5601 Py_DECREF(u);
5602 return (PyObject *)v;
5603 }
5604
5605 /* Concat the two Unicode strings */
5606 w = _PyUnicode_New(u->length + v->length);
5607 if (w == NULL)
5608 goto onError;
5609 Py_UNICODE_COPY(w->str, u->str, u->length);
5610 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5611
5612 Py_DECREF(u);
5613 Py_DECREF(v);
5614 return (PyObject *)w;
5615
5616onError:
5617 Py_XDECREF(u);
5618 Py_XDECREF(v);
5619 return NULL;
5620}
5621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005622PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623"S.count(sub[, start[, end]]) -> int\n\
5624\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005625Return the number of non-overlapping occurrences of substring sub in\n\
5626Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005627interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628
5629static PyObject *
5630unicode_count(PyUnicodeObject *self, PyObject *args)
5631{
5632 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005633 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005634 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 PyObject *result;
5636
Guido van Rossumb8872e62000-05-09 14:14:27 +00005637 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5638 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 return NULL;
5640
5641 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00005642 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 if (substring == NULL)
5644 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005645
Thomas Wouters477c8d52006-05-27 19:21:47 +00005646 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647
Thomas Wouters477c8d52006-05-27 19:21:47 +00005648 result = PyInt_FromSsize_t(
5649 stringlib_count(self->str + start, end - start,
5650 substring->str, substring->length)
5651 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652
5653 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 return result;
5656}
5657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005658PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005659"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005661Encodes S using the codec registered for encoding. encoding defaults\n\
5662to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005663handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5665'xmlcharrefreplace' as well as any other name registered with\n\
5666codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667
5668static PyObject *
5669unicode_encode(PyUnicodeObject *self, PyObject *args)
5670{
5671 char *encoding = NULL;
5672 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005673 PyObject *v;
5674
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5676 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005677 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005678 if (v == NULL)
5679 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005680 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00005681 if (PyString_Check(v)) {
5682 /* Old codec, turn it into bytes */
5683 PyObject *b = PyBytes_FromObject(v);
5684 Py_DECREF(v);
5685 return b;
5686 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005687 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00005688 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005689 "(type=%.400s)",
5690 v->ob_type->tp_name);
5691 Py_DECREF(v);
5692 return NULL;
5693 }
5694 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005695
5696 onError:
5697 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005698}
5699
5700PyDoc_STRVAR(decode__doc__,
5701"S.decode([encoding[,errors]]) -> string or unicode\n\
5702\n\
5703Decodes S using the codec registered for encoding. encoding defaults\n\
5704to the default encoding. errors may be given to set a different error\n\
5705handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5706a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5707as well as any other name registerd with codecs.register_error that is\n\
5708able to handle UnicodeDecodeErrors.");
5709
5710static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005711unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005712{
5713 char *encoding = NULL;
5714 char *errors = NULL;
5715 PyObject *v;
5716
5717 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5718 return NULL;
5719 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005720 if (v == NULL)
5721 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005722 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5723 PyErr_Format(PyExc_TypeError,
5724 "decoder did not return a string/unicode object "
5725 "(type=%.400s)",
5726 v->ob_type->tp_name);
5727 Py_DECREF(v);
5728 return NULL;
5729 }
5730 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005731
5732 onError:
5733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734}
5735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005736PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737"S.expandtabs([tabsize]) -> unicode\n\
5738\n\
5739Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005740If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
5742static PyObject*
5743unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5744{
5745 Py_UNICODE *e;
5746 Py_UNICODE *p;
5747 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005748 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 PyUnicodeObject *u;
5750 int tabsize = 8;
5751
5752 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5753 return NULL;
5754
Thomas Wouters7e474022000-07-16 12:04:32 +00005755 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 i = j = 0;
5757 e = self->str + self->length;
5758 for (p = self->str; p < e; p++)
5759 if (*p == '\t') {
5760 if (tabsize > 0)
5761 j += tabsize - (j % tabsize);
5762 }
5763 else {
5764 j++;
5765 if (*p == '\n' || *p == '\r') {
5766 i += j;
5767 j = 0;
5768 }
5769 }
5770
5771 /* Second pass: create output string and fill it */
5772 u = _PyUnicode_New(i + j);
5773 if (!u)
5774 return NULL;
5775
5776 j = 0;
5777 q = u->str;
5778
5779 for (p = self->str; p < e; p++)
5780 if (*p == '\t') {
5781 if (tabsize > 0) {
5782 i = tabsize - (j % tabsize);
5783 j += i;
5784 while (i--)
5785 *q++ = ' ';
5786 }
5787 }
5788 else {
5789 j++;
5790 *q++ = *p;
5791 if (*p == '\n' || *p == '\r')
5792 j = 0;
5793 }
5794
5795 return (PyObject*) u;
5796}
5797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005798PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799"S.find(sub [,start [,end]]) -> int\n\
5800\n\
5801Return the lowest index in S where substring sub is found,\n\
5802such that sub is contained within s[start,end]. Optional\n\
5803arguments start and end are interpreted as in slice notation.\n\
5804\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005805Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806
5807static PyObject *
5808unicode_find(PyUnicodeObject *self, PyObject *args)
5809{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005810 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005811 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005812 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005813 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
Guido van Rossumb8872e62000-05-09 14:14:27 +00005815 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5816 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005818 substring = PyUnicode_FromObject(substring);
5819 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 return NULL;
5821
Thomas Wouters477c8d52006-05-27 19:21:47 +00005822 result = stringlib_find_slice(
5823 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5824 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5825 start, end
5826 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827
5828 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005829
5830 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831}
5832
5833static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005834unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835{
5836 if (index < 0 || index >= self->length) {
5837 PyErr_SetString(PyExc_IndexError, "string index out of range");
5838 return NULL;
5839 }
5840
5841 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5842}
5843
5844static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005845unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005847 /* Since Unicode objects compare equal to their UTF-8 string
5848 counterparts, we hash the UTF-8 string. */
5849 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
5850 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851}
5852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005853PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854"S.index(sub [,start [,end]]) -> int\n\
5855\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005856Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857
5858static PyObject *
5859unicode_index(PyUnicodeObject *self, PyObject *args)
5860{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005861 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005862 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005864 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
Guido van Rossumb8872e62000-05-09 14:14:27 +00005866 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5867 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005869 substring = PyUnicode_FromObject(substring);
5870 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 return NULL;
5872
Thomas Wouters477c8d52006-05-27 19:21:47 +00005873 result = stringlib_find_slice(
5874 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5875 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5876 start, end
5877 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
5879 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005880
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 if (result < 0) {
5882 PyErr_SetString(PyExc_ValueError, "substring not found");
5883 return NULL;
5884 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005885
Martin v. Löwis18e16552006-02-15 17:27:45 +00005886 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887}
5888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005889PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005890"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005892Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005893at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
5895static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005896unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897{
5898 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5899 register const Py_UNICODE *e;
5900 int cased;
5901
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 /* Shortcut for single character strings */
5903 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005904 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005906 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005907 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005908 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 e = p + PyUnicode_GET_SIZE(self);
5911 cased = 0;
5912 for (; p < e; p++) {
5913 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005914
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005916 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 else if (!cased && Py_UNICODE_ISLOWER(ch))
5918 cased = 1;
5919 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005920 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921}
5922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005923PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005924"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005926Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005927at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928
5929static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005930unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931{
5932 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5933 register const Py_UNICODE *e;
5934 int cased;
5935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 /* Shortcut for single character strings */
5937 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005938 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005940 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005941 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005942 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 e = p + PyUnicode_GET_SIZE(self);
5945 cased = 0;
5946 for (; p < e; p++) {
5947 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005948
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005950 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 else if (!cased && Py_UNICODE_ISUPPER(ch))
5952 cased = 1;
5953 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005954 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955}
5956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005957PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005958"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005960Return True if S is a titlecased string and there is at least one\n\
5961character in S, i.e. upper- and titlecase characters may only\n\
5962follow uncased characters and lowercase characters only cased ones.\n\
5963Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
5965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005966unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967{
5968 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5969 register const Py_UNICODE *e;
5970 int cased, previous_is_cased;
5971
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 /* Shortcut for single character strings */
5973 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005974 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5975 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005977 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005978 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005979 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005980
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 e = p + PyUnicode_GET_SIZE(self);
5982 cased = 0;
5983 previous_is_cased = 0;
5984 for (; p < e; p++) {
5985 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5988 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005989 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 previous_is_cased = 1;
5991 cased = 1;
5992 }
5993 else if (Py_UNICODE_ISLOWER(ch)) {
5994 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005995 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 previous_is_cased = 1;
5997 cased = 1;
5998 }
5999 else
6000 previous_is_cased = 0;
6001 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006002 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003}
6004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006005PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006006"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006008Return True if all characters in S are whitespace\n\
6009and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
6011static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006012unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013{
6014 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6015 register const Py_UNICODE *e;
6016
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 /* Shortcut for single character strings */
6018 if (PyUnicode_GET_SIZE(self) == 1 &&
6019 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006020 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006022 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006023 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006024 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006025
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 e = p + PyUnicode_GET_SIZE(self);
6027 for (; p < e; p++) {
6028 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006029 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006031 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032}
6033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006034PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006035"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006036\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006037Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006038and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006039
6040static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006041unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006042{
6043 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6044 register const Py_UNICODE *e;
6045
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006046 /* Shortcut for single character strings */
6047 if (PyUnicode_GET_SIZE(self) == 1 &&
6048 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006049 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006050
6051 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006052 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006053 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006054
6055 e = p + PyUnicode_GET_SIZE(self);
6056 for (; p < e; p++) {
6057 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006058 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006059 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006060 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006061}
6062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006063PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006064"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006065\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006066Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006067and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006068
6069static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006070unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006071{
6072 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6073 register const Py_UNICODE *e;
6074
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006075 /* Shortcut for single character strings */
6076 if (PyUnicode_GET_SIZE(self) == 1 &&
6077 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006078 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006079
6080 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006081 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006082 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006083
6084 e = p + PyUnicode_GET_SIZE(self);
6085 for (; p < e; p++) {
6086 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006087 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006088 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006089 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006090}
6091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006092PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006093"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006095Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006096False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097
6098static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006099unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100{
6101 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6102 register const Py_UNICODE *e;
6103
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 /* Shortcut for single character strings */
6105 if (PyUnicode_GET_SIZE(self) == 1 &&
6106 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006107 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006109 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006110 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006111 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006112
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 e = p + PyUnicode_GET_SIZE(self);
6114 for (; p < e; p++) {
6115 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006116 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006118 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119}
6120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006121PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006122"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006124Return True if all characters in S are digits\n\
6125and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126
6127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006128unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129{
6130 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6131 register const Py_UNICODE *e;
6132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 /* Shortcut for single character strings */
6134 if (PyUnicode_GET_SIZE(self) == 1 &&
6135 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006136 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006138 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006139 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006140 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006141
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 e = p + PyUnicode_GET_SIZE(self);
6143 for (; p < e; p++) {
6144 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006145 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006147 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148}
6149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006150PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006151"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006153Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006154False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
6156static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006157unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158{
6159 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6160 register const Py_UNICODE *e;
6161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 /* Shortcut for single character strings */
6163 if (PyUnicode_GET_SIZE(self) == 1 &&
6164 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006165 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006167 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006168 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006169 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006170
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 e = p + PyUnicode_GET_SIZE(self);
6172 for (; p < e; p++) {
6173 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006174 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006176 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177}
6178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006179PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180"S.join(sequence) -> unicode\n\
6181\n\
6182Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006183sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
6185static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006186unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006188 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189}
6190
Martin v. Löwis18e16552006-02-15 17:27:45 +00006191static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192unicode_length(PyUnicodeObject *self)
6193{
6194 return self->length;
6195}
6196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006197PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006198"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199\n\
6200Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006201done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
6203static PyObject *
6204unicode_ljust(PyUnicodeObject *self, PyObject *args)
6205{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006206 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006207 Py_UNICODE fillchar = ' ';
6208
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006209 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 return NULL;
6211
Tim Peters7a29bd52001-09-12 03:03:31 +00006212 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 Py_INCREF(self);
6214 return (PyObject*) self;
6215 }
6216
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006217 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006220PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221"S.lower() -> unicode\n\
6222\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006223Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
6225static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006226unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 return fixup(self, fixlower);
6229}
6230
/* Strip modes accepted by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for mode i: the format string past its 3-character "|O:" */
#define STRIPNAME(i) (&stripformat[i][3])
6239
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006240/* externally visible for str.strip(unicode) */
6241PyObject *
6242_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6243{
6244 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006245 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006246 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006247 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6248 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006249
Thomas Wouters477c8d52006-05-27 19:21:47 +00006250 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6251
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006252 i = 0;
6253 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006254 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6255 i++;
6256 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006257 }
6258
6259 j = len;
6260 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006261 do {
6262 j--;
6263 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6264 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006265 }
6266
6267 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006268 Py_INCREF(self);
6269 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006270 }
6271 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006272 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006273}
6274
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275
6276static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006277do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006279 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006280 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006281
6282 i = 0;
6283 if (striptype != RIGHTSTRIP) {
6284 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6285 i++;
6286 }
6287 }
6288
6289 j = len;
6290 if (striptype != LEFTSTRIP) {
6291 do {
6292 j--;
6293 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6294 j++;
6295 }
6296
6297 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6298 Py_INCREF(self);
6299 return (PyObject*)self;
6300 }
6301 else
6302 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303}
6304
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006305
6306static PyObject *
6307do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6308{
6309 PyObject *sep = NULL;
6310
6311 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6312 return NULL;
6313
6314 if (sep != NULL && sep != Py_None) {
6315 if (PyUnicode_Check(sep))
6316 return _PyUnicode_XStrip(self, striptype, sep);
6317 else if (PyString_Check(sep)) {
6318 PyObject *res;
6319 sep = PyUnicode_FromObject(sep);
6320 if (sep==NULL)
6321 return NULL;
6322 res = _PyUnicode_XStrip(self, striptype, sep);
6323 Py_DECREF(sep);
6324 return res;
6325 }
6326 else {
6327 PyErr_Format(PyExc_TypeError,
6328 "%s arg must be None, unicode or str",
6329 STRIPNAME(striptype));
6330 return NULL;
6331 }
6332 }
6333
6334 return do_strip(self, striptype);
6335}
6336
6337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006338PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006339"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006340\n\
6341Return a copy of the string S with leading and trailing\n\
6342whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006343If chars is given and not None, remove characters in chars instead.\n\
6344If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006345
6346static PyObject *
6347unicode_strip(PyUnicodeObject *self, PyObject *args)
6348{
6349 if (PyTuple_GET_SIZE(args) == 0)
6350 return do_strip(self, BOTHSTRIP); /* Common case */
6351 else
6352 return do_argstrip(self, BOTHSTRIP, args);
6353}
6354
6355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006356PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006357"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006358\n\
6359Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006360If chars is given and not None, remove characters in chars instead.\n\
6361If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006362
6363static PyObject *
6364unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6365{
6366 if (PyTuple_GET_SIZE(args) == 0)
6367 return do_strip(self, LEFTSTRIP); /* Common case */
6368 else
6369 return do_argstrip(self, LEFTSTRIP, args);
6370}
6371
6372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006373PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006374"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006375\n\
6376Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006377If chars is given and not None, remove characters in chars instead.\n\
6378If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006379
6380static PyObject *
6381unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6382{
6383 if (PyTuple_GET_SIZE(args) == 0)
6384 return do_strip(self, RIGHTSTRIP); /* Common case */
6385 else
6386 return do_argstrip(self, RIGHTSTRIP, args);
6387}
6388
6389
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006391unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392{
6393 PyUnicodeObject *u;
6394 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006395 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006396 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
6398 if (len < 0)
6399 len = 0;
6400
Tim Peters7a29bd52001-09-12 03:03:31 +00006401 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 /* no repeat, return original string */
6403 Py_INCREF(str);
6404 return (PyObject*) str;
6405 }
Tim Peters8f422462000-09-09 06:13:41 +00006406
6407 /* ensure # of chars needed doesn't overflow int and # of bytes
6408 * needed doesn't overflow size_t
6409 */
6410 nchars = len * str->length;
6411 if (len && nchars / len != str->length) {
6412 PyErr_SetString(PyExc_OverflowError,
6413 "repeated string is too long");
6414 return NULL;
6415 }
6416 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6417 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6418 PyErr_SetString(PyExc_OverflowError,
6419 "repeated string is too long");
6420 return NULL;
6421 }
6422 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 if (!u)
6424 return NULL;
6425
6426 p = u->str;
6427
Thomas Wouters477c8d52006-05-27 19:21:47 +00006428 if (str->length == 1 && len > 0) {
6429 Py_UNICODE_FILL(p, str->str[0], len);
6430 } else {
6431 Py_ssize_t done = 0; /* number of characters copied this far */
6432 if (done < nchars) {
6433 Py_UNICODE_COPY(p, str->str, str->length);
6434 done = str->length;
6435 }
6436 while (done < nchars) {
6437 int n = (done <= nchars-done) ? done : nchars-done;
6438 Py_UNICODE_COPY(p+done, p, n);
6439 done += n;
6440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 }
6442
6443 return (PyObject*) u;
6444}
6445
6446PyObject *PyUnicode_Replace(PyObject *obj,
6447 PyObject *subobj,
6448 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006449 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450{
6451 PyObject *self;
6452 PyObject *str1;
6453 PyObject *str2;
6454 PyObject *result;
6455
6456 self = PyUnicode_FromObject(obj);
6457 if (self == NULL)
6458 return NULL;
6459 str1 = PyUnicode_FromObject(subobj);
6460 if (str1 == NULL) {
6461 Py_DECREF(self);
6462 return NULL;
6463 }
6464 str2 = PyUnicode_FromObject(replobj);
6465 if (str2 == NULL) {
6466 Py_DECREF(self);
6467 Py_DECREF(str1);
6468 return NULL;
6469 }
Tim Petersced69f82003-09-16 20:30:58 +00006470 result = replace((PyUnicodeObject *)self,
6471 (PyUnicodeObject *)str1,
6472 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 maxcount);
6474 Py_DECREF(self);
6475 Py_DECREF(str1);
6476 Py_DECREF(str2);
6477 return result;
6478}
6479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006480PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481"S.replace (old, new[, maxsplit]) -> unicode\n\
6482\n\
6483Return a copy of S with all occurrences of substring\n\
6484old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006485given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
6487static PyObject*
6488unicode_replace(PyUnicodeObject *self, PyObject *args)
6489{
6490 PyUnicodeObject *str1;
6491 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006492 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 PyObject *result;
6494
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 return NULL;
6497 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6498 if (str1 == NULL)
6499 return NULL;
6500 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006501 if (str2 == NULL) {
6502 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505
6506 result = replace(self, str1, str2, maxcount);
6507
6508 Py_DECREF(str1);
6509 Py_DECREF(str2);
6510 return result;
6511}
6512
6513static
6514PyObject *unicode_repr(PyObject *unicode)
6515{
6516 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6517 PyUnicode_GET_SIZE(unicode),
6518 1);
6519}
6520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006521PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522"S.rfind(sub [,start [,end]]) -> int\n\
6523\n\
6524Return the highest index in S where substring sub is found,\n\
6525such that sub is contained within s[start,end]. Optional\n\
6526arguments start and end are interpreted as in slice notation.\n\
6527\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006528Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
6530static PyObject *
6531unicode_rfind(PyUnicodeObject *self, PyObject *args)
6532{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006534 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006535 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006536 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537
Guido van Rossumb8872e62000-05-09 14:14:27 +00006538 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6539 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006541 substring = PyUnicode_FromObject(substring);
6542 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 return NULL;
6544
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 result = stringlib_rfind_slice(
6546 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6547 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6548 start, end
6549 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550
6551 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006552
6553 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554}
6555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006556PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557"S.rindex(sub [,start [,end]]) -> int\n\
6558\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006559Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560
6561static PyObject *
6562unicode_rindex(PyUnicodeObject *self, PyObject *args)
6563{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006564 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006565 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006566 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006567 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
Guido van Rossumb8872e62000-05-09 14:14:27 +00006569 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6570 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006572 substring = PyUnicode_FromObject(substring);
6573 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 return NULL;
6575
Thomas Wouters477c8d52006-05-27 19:21:47 +00006576 result = stringlib_rfind_slice(
6577 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6578 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6579 start, end
6580 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
6582 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006583
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 if (result < 0) {
6585 PyErr_SetString(PyExc_ValueError, "substring not found");
6586 return NULL;
6587 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006588 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589}
6590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006591PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006592"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593\n\
6594Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006595done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
6597static PyObject *
6598unicode_rjust(PyUnicodeObject *self, PyObject *args)
6599{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006600 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006601 Py_UNICODE fillchar = ' ';
6602
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006603 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 return NULL;
6605
Tim Peters7a29bd52001-09-12 03:03:31 +00006606 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 Py_INCREF(self);
6608 return (PyObject*) self;
6609 }
6610
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006611 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006615unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
6617 /* standard clamping */
6618 if (start < 0)
6619 start = 0;
6620 if (end < 0)
6621 end = 0;
6622 if (end > self->length)
6623 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006624 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 /* full slice, return original string */
6626 Py_INCREF(self);
6627 return (PyObject*) self;
6628 }
6629 if (start > end)
6630 start = end;
6631 /* copy slice */
6632 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6633 end - start);
6634}
6635
6636PyObject *PyUnicode_Split(PyObject *s,
6637 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006638 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
6640 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006641
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 s = PyUnicode_FromObject(s);
6643 if (s == NULL)
6644 return NULL;
6645 if (sep != NULL) {
6646 sep = PyUnicode_FromObject(sep);
6647 if (sep == NULL) {
6648 Py_DECREF(s);
6649 return NULL;
6650 }
6651 }
6652
6653 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6654
6655 Py_DECREF(s);
6656 Py_XDECREF(sep);
6657 return result;
6658}
6659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006660PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661"S.split([sep [,maxsplit]]) -> list of strings\n\
6662\n\
6663Return a list of the words in S, using sep as the\n\
6664delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006665splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006666any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667
6668static PyObject*
6669unicode_split(PyUnicodeObject *self, PyObject *args)
6670{
6671 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006672 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 return NULL;
6676
6677 if (substring == Py_None)
6678 return split(self, NULL, maxcount);
6679 else if (PyUnicode_Check(substring))
6680 return split(self, (PyUnicodeObject *)substring, maxcount);
6681 else
6682 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6683}
6684
Thomas Wouters477c8d52006-05-27 19:21:47 +00006685PyObject *
6686PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6687{
6688 PyObject* str_obj;
6689 PyObject* sep_obj;
6690 PyObject* out;
6691
6692 str_obj = PyUnicode_FromObject(str_in);
6693 if (!str_obj)
6694 return NULL;
6695 sep_obj = PyUnicode_FromObject(sep_in);
6696 if (!sep_obj) {
6697 Py_DECREF(str_obj);
6698 return NULL;
6699 }
6700
6701 out = stringlib_partition(
6702 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6703 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6704 );
6705
6706 Py_DECREF(sep_obj);
6707 Py_DECREF(str_obj);
6708
6709 return out;
6710}
6711
6712
6713PyObject *
6714PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6715{
6716 PyObject* str_obj;
6717 PyObject* sep_obj;
6718 PyObject* out;
6719
6720 str_obj = PyUnicode_FromObject(str_in);
6721 if (!str_obj)
6722 return NULL;
6723 sep_obj = PyUnicode_FromObject(sep_in);
6724 if (!sep_obj) {
6725 Py_DECREF(str_obj);
6726 return NULL;
6727 }
6728
6729 out = stringlib_rpartition(
6730 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6731 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6732 );
6733
6734 Py_DECREF(sep_obj);
6735 Py_DECREF(str_obj);
6736
6737 return out;
6738}
6739
6740PyDoc_STRVAR(partition__doc__,
6741"S.partition(sep) -> (head, sep, tail)\n\
6742\n\
6743Searches for the separator sep in S, and returns the part before it,\n\
6744the separator itself, and the part after it. If the separator is not\n\
6745found, returns S and two empty strings.");
6746
6747static PyObject*
6748unicode_partition(PyUnicodeObject *self, PyObject *separator)
6749{
6750 return PyUnicode_Partition((PyObject *)self, separator);
6751}
6752
6753PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00006754"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006755\n\
6756Searches for the separator sep in S, starting at the end of S, and returns\n\
6757the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00006758separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00006759
6760static PyObject*
6761unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6762{
6763 return PyUnicode_RPartition((PyObject *)self, separator);
6764}
6765
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006766PyObject *PyUnicode_RSplit(PyObject *s,
6767 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006768 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006769{
6770 PyObject *result;
6771
6772 s = PyUnicode_FromObject(s);
6773 if (s == NULL)
6774 return NULL;
6775 if (sep != NULL) {
6776 sep = PyUnicode_FromObject(sep);
6777 if (sep == NULL) {
6778 Py_DECREF(s);
6779 return NULL;
6780 }
6781 }
6782
6783 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6784
6785 Py_DECREF(s);
6786 Py_XDECREF(sep);
6787 return result;
6788}
6789
6790PyDoc_STRVAR(rsplit__doc__,
6791"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6792\n\
6793Return a list of the words in S, using sep as the\n\
6794delimiter string, starting at the end of the string and\n\
6795working to the front. If maxsplit is given, at most maxsplit\n\
6796splits are done. If sep is not specified, any whitespace string\n\
6797is a separator.");
6798
6799static PyObject*
6800unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6801{
6802 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006803 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006804
Martin v. Löwis18e16552006-02-15 17:27:45 +00006805 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006806 return NULL;
6807
6808 if (substring == Py_None)
6809 return rsplit(self, NULL, maxcount);
6810 else if (PyUnicode_Check(substring))
6811 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6812 else
6813 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6814}
6815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006816PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006817"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818\n\
6819Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006820Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006821is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
6823static PyObject*
6824unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6825{
Guido van Rossum86662912000-04-11 15:38:46 +00006826 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
Guido van Rossum86662912000-04-11 15:38:46 +00006828 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 return NULL;
6830
Guido van Rossum86662912000-04-11 15:38:46 +00006831 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832}
6833
6834static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006835PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006837 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
6838 Py_XINCREF(res);
6839 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840}
6841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843"S.swapcase() -> unicode\n\
6844\n\
6845Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006846and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
6848static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006849unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 return fixup(self, fixswapcase);
6852}
6853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006854PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855"S.translate(table) -> unicode\n\
6856\n\
6857Return a copy of the string S, where all characters have been mapped\n\
6858through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006859Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6860Unmapped characters are left untouched. Characters mapped to None\n\
6861are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
Tim Petersced69f82003-09-16 20:30:58 +00006866 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006868 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 "ignore");
6870}
6871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006872PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873"S.upper() -> unicode\n\
6874\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876
6877static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006878unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 return fixup(self, fixupper);
6881}
6882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006883PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884"S.zfill(width) -> unicode\n\
6885\n\
6886Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006887of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888
6889static PyObject *
6890unicode_zfill(PyUnicodeObject *self, PyObject *args)
6891{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006892 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 PyUnicodeObject *u;
6894
Martin v. Löwis18e16552006-02-15 17:27:45 +00006895 Py_ssize_t width;
6896 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 return NULL;
6898
6899 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006900 if (PyUnicode_CheckExact(self)) {
6901 Py_INCREF(self);
6902 return (PyObject*) self;
6903 }
6904 else
6905 return PyUnicode_FromUnicode(
6906 PyUnicode_AS_UNICODE(self),
6907 PyUnicode_GET_SIZE(self)
6908 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 }
6910
6911 fill = width - self->length;
6912
6913 u = pad(self, fill, 0, '0');
6914
Walter Dörwald068325e2002-04-15 13:36:47 +00006915 if (u == NULL)
6916 return NULL;
6917
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 if (u->str[fill] == '+' || u->str[fill] == '-') {
6919 /* move sign to beginning of string */
6920 u->str[0] = u->str[fill];
6921 u->str[fill] = '0';
6922 }
6923
6924 return (PyObject*) u;
6925}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
#if 0
/* Compiled out: reports unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006935PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006936"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006938Return True if S starts with the specified prefix, False otherwise.\n\
6939With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006940With optional end, stop comparing S at that position.\n\
6941prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942
6943static PyObject *
6944unicode_startswith(PyUnicodeObject *self,
6945 PyObject *args)
6946{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006949 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006950 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006951 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006954 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006956 if (PyTuple_Check(subobj)) {
6957 Py_ssize_t i;
6958 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6959 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6960 PyTuple_GET_ITEM(subobj, i));
6961 if (substring == NULL)
6962 return NULL;
6963 result = tailmatch(self, substring, start, end, -1);
6964 Py_DECREF(substring);
6965 if (result) {
6966 Py_RETURN_TRUE;
6967 }
6968 }
6969 /* nothing matched */
6970 Py_RETURN_FALSE;
6971 }
6972 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974 return NULL;
6975 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978}
6979
6980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006981PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006982"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006984Return True if S ends with the specified suffix, False otherwise.\n\
6985With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986With optional end, stop comparing S at that position.\n\
6987suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988
6989static PyObject *
6990unicode_endswith(PyUnicodeObject *self,
6991 PyObject *args)
6992{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006993 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006995 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006996 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006997 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006999 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7000 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002 if (PyTuple_Check(subobj)) {
7003 Py_ssize_t i;
7004 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7005 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7006 PyTuple_GET_ITEM(subobj, i));
7007 if (substring == NULL)
7008 return NULL;
7009 result = tailmatch(self, substring, start, end, +1);
7010 Py_DECREF(substring);
7011 if (result) {
7012 Py_RETURN_TRUE;
7013 }
7014 }
7015 Py_RETURN_FALSE;
7016 }
7017 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007021 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024}
7025
7026
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007027
7028static PyObject *
7029unicode_getnewargs(PyUnicodeObject *v)
7030{
7031 return Py_BuildValue("(u#)", v->str, v->length);
7032}
7033
7034
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035static PyMethodDef unicode_methods[] = {
7036
7037 /* Order is according to common usage: often used methods should
7038 appear first, since lookup is done sequentially. */
7039
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007040 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7041 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7042 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007043 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007044 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7045 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7046 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7047 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7048 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7049 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7050 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007051 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007052 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7053 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7054 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007055 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007056 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007057/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7058 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7059 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7060 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007061 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007062 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007063 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007064 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007065 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7066 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7067 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7068 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7069 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7070 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7071 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7072 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7073 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7074 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7075 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7076 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7077 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7078 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007079 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007080#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007081 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082#endif
7083
7084#if 0
7085 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007086 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087#endif
7088
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007089 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 {NULL, NULL}
7091};
7092
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007093static PyObject *
7094unicode_mod(PyObject *v, PyObject *w)
7095{
7096 if (!PyUnicode_Check(v)) {
7097 Py_INCREF(Py_NotImplemented);
7098 return Py_NotImplemented;
7099 }
7100 return PyUnicode_Format(v, w);
7101}
7102
7103static PyNumberMethods unicode_as_number = {
7104 0, /*nb_add*/
7105 0, /*nb_subtract*/
7106 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007107 unicode_mod, /*nb_remainder*/
7108};
7109
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007111 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007112 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007113 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7114 (ssizeargfunc) unicode_getitem, /* sq_item */
7115 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116 0, /* sq_ass_item */
7117 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007118 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119};
7120
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007121static PyObject*
7122unicode_subscript(PyUnicodeObject* self, PyObject* item)
7123{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007124 if (PyIndex_Check(item)) {
7125 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007126 if (i == -1 && PyErr_Occurred())
7127 return NULL;
7128 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007129 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007130 return unicode_getitem(self, i);
7131 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007132 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007133 Py_UNICODE* source_buf;
7134 Py_UNICODE* result_buf;
7135 PyObject* result;
7136
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007137 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007138 &start, &stop, &step, &slicelength) < 0) {
7139 return NULL;
7140 }
7141
7142 if (slicelength <= 0) {
7143 return PyUnicode_FromUnicode(NULL, 0);
7144 } else {
7145 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007146 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7147 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007148
7149 if (result_buf == NULL)
7150 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007151
7152 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7153 result_buf[i] = source_buf[cur];
7154 }
Tim Petersced69f82003-09-16 20:30:58 +00007155
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007156 result = PyUnicode_FromUnicode(result_buf, slicelength);
7157 PyMem_FREE(result_buf);
7158 return result;
7159 }
7160 } else {
7161 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7162 return NULL;
7163 }
7164}
7165
7166static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007167 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007168 (binaryfunc)unicode_subscript, /* mp_subscript */
7169 (objobjargproc)0, /* mp_ass_subscript */
7170};
7171
Martin v. Löwis18e16552006-02-15 17:27:45 +00007172static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007174 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 const void **ptr)
7176{
7177 if (index != 0) {
7178 PyErr_SetString(PyExc_SystemError,
7179 "accessing non-existent unicode segment");
7180 return -1;
7181 }
7182 *ptr = (void *) self->str;
7183 return PyUnicode_GET_DATA_SIZE(self);
7184}
7185
Martin v. Löwis18e16552006-02-15 17:27:45 +00007186static Py_ssize_t
7187unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 const void **ptr)
7189{
7190 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007191 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 return -1;
7193}
7194
7195static int
7196unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007197 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198{
7199 if (lenp)
7200 *lenp = PyUnicode_GET_DATA_SIZE(self);
7201 return 1;
7202}
7203
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007204static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007206 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 const void **ptr)
7208{
7209 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007210
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 if (index != 0) {
7212 PyErr_SetString(PyExc_SystemError,
7213 "accessing non-existent unicode segment");
7214 return -1;
7215 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007216 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 if (str == NULL)
7218 return -1;
7219 *ptr = (void *) PyString_AS_STRING(str);
7220 return PyString_GET_SIZE(str);
7221}
7222
7223/* Helpers for PyUnicode_Format() */
7224
7225static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007226getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007228 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 if (argidx < arglen) {
7230 (*p_argidx)++;
7231 if (arglen < 0)
7232 return args;
7233 else
7234 return PyTuple_GetItem(args, argidx);
7235 }
7236 PyErr_SetString(PyExc_TypeError,
7237 "not enough arguments for format string");
7238 return NULL;
7239}
7240
/* Conversion-flag bits collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
7246
Martin v. Löwis18e16552006-02-15 17:27:45 +00007247static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007248strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007250 register Py_ssize_t i;
7251 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 for (i = len - 1; i >= 0; i--)
7253 buffer[i] = (Py_UNICODE) charbuffer[i];
7254
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 return len;
7256}
7257
Neal Norwitzfc76d632006-01-10 06:03:13 +00007258static int
7259doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7260{
Tim Peters15231542006-02-16 01:08:01 +00007261 Py_ssize_t result;
7262
Neal Norwitzfc76d632006-01-10 06:03:13 +00007263 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007264 result = strtounicode(buffer, (char *)buffer);
7265 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007266}
7267
7268static int
7269longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7270{
Tim Peters15231542006-02-16 01:08:01 +00007271 Py_ssize_t result;
7272
Neal Norwitzfc76d632006-01-10 06:03:13 +00007273 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007274 result = strtounicode(buffer, (char *)buffer);
7275 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007276}
7277
Guido van Rossum078151d2002-08-11 04:24:12 +00007278/* XXX To save some code duplication, formatfloat/long/int could have been
7279 shared with stringobject.c, converting from 8-bit to Unicode after the
7280 formatting is done. */
7281
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282static int
7283formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007284 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 int flags,
7286 int prec,
7287 int type,
7288 PyObject *v)
7289{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007290 /* fmt = '%#.' + `prec` + `type`
7291 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 char fmt[20];
7293 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007294
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295 x = PyFloat_AsDouble(v);
7296 if (x == -1.0 && PyErr_Occurred())
7297 return -1;
7298 if (prec < 0)
7299 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7301 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007302 /* Worst case length calc to ensure no buffer overrun:
7303
7304 'g' formats:
7305 fmt = %#.<prec>g
7306 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7307 for any double rep.)
7308 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7309
7310 'f' formats:
7311 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7312 len = 1 + 50 + 1 + prec = 52 + prec
7313
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007314 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007315 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007316
7317 */
7318 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7319 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007320 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007321 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007322 return -1;
7323 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007324 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7325 (flags&F_ALT) ? "#" : "",
7326 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007327 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328}
7329
Tim Peters38fd5b62000-09-21 05:43:11 +00007330static PyObject*
7331formatlong(PyObject *val, int flags, int prec, int type)
7332{
7333 char *buf;
7334 int i, len;
7335 PyObject *str; /* temporary string object. */
7336 PyUnicodeObject *result;
7337
7338 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7339 if (!str)
7340 return NULL;
7341 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007342 if (!result) {
7343 Py_DECREF(str);
7344 return NULL;
7345 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007346 for (i = 0; i < len; i++)
7347 result->str[i] = buf[i];
7348 result->str[len] = 0;
7349 Py_DECREF(str);
7350 return (PyObject*)result;
7351}
7352
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353static int
7354formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007355 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 int flags,
7357 int prec,
7358 int type,
7359 PyObject *v)
7360{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007361 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007362 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7363 * + 1 + 1
7364 * = 24
7365 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007366 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007367 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 long x;
7369
7370 x = PyInt_AsLong(v);
7371 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007372 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007373 if (x < 0 && type == 'u') {
7374 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007375 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007376 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7377 sign = "-";
7378 else
7379 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007381 prec = 1;
7382
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007383 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7384 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007385 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007386 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007387 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007388 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007389 return -1;
7390 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007391
7392 if ((flags & F_ALT) &&
7393 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007394 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007395 * of issues that cause pain:
7396 * - when 0 is being converted, the C standard leaves off
7397 * the '0x' or '0X', which is inconsistent with other
7398 * %#x/%#X conversions and inconsistent with Python's
7399 * hex() function
7400 * - there are platforms that violate the standard and
7401 * convert 0 with the '0x' or '0X'
7402 * (Metrowerks, Compaq Tru64)
7403 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007404 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007405 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007406 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007407 * We can achieve the desired consistency by inserting our
7408 * own '0x' or '0X' prefix, and substituting %x/%X in place
7409 * of %#x/%#X.
7410 *
7411 * Note that this is the same approach as used in
7412 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007413 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007414 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7415 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007416 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007417 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007418 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7419 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007420 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007421 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007422 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007423 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007424 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007425 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426}
7427
7428static int
7429formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007430 size_t buflen,
7431 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007433 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007434 if (PyUnicode_Check(v)) {
7435 if (PyUnicode_GET_SIZE(v) != 1)
7436 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007440 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007441 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007442 goto onError;
7443 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445
7446 else {
7447 /* Integer input truncated to a character */
7448 long x;
7449 x = PyInt_AsLong(v);
7450 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007451 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007452#ifdef Py_UNICODE_WIDE
7453 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007454 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007455 "%c arg not in range(0x110000) "
7456 "(wide Python build)");
7457 return -1;
7458 }
7459#else
7460 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007461 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007462 "%c arg not in range(0x10000) "
7463 "(narrow Python build)");
7464 return -1;
7465 }
7466#endif
7467 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 }
7469 buf[1] = '\0';
7470 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007471
7472 onError:
7473 PyErr_SetString(PyExc_TypeError,
7474 "%c requires int or char");
7475 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476}
7477
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
7487
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488PyObject *PyUnicode_Format(PyObject *format,
7489 PyObject *args)
7490{
7491 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007492 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 int args_owned = 0;
7494 PyUnicodeObject *result = NULL;
7495 PyObject *dict = NULL;
7496 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007497
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 if (format == NULL || args == NULL) {
7499 PyErr_BadInternalCall();
7500 return NULL;
7501 }
7502 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007503 if (uformat == NULL)
7504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 fmt = PyUnicode_AS_UNICODE(uformat);
7506 fmtcnt = PyUnicode_GET_SIZE(uformat);
7507
7508 reslen = rescnt = fmtcnt + 100;
7509 result = _PyUnicode_New(reslen);
7510 if (result == NULL)
7511 goto onError;
7512 res = PyUnicode_AS_UNICODE(result);
7513
7514 if (PyTuple_Check(args)) {
7515 arglen = PyTuple_Size(args);
7516 argidx = 0;
7517 }
7518 else {
7519 arglen = -1;
7520 argidx = -2;
7521 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007522 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7523 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 dict = args;
7525
7526 while (--fmtcnt >= 0) {
7527 if (*fmt != '%') {
7528 if (--rescnt < 0) {
7529 rescnt = fmtcnt + 100;
7530 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007531 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007532 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7534 --rescnt;
7535 }
7536 *res++ = *fmt++;
7537 }
7538 else {
7539 /* Got a format specifier */
7540 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007541 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 Py_UNICODE c = '\0';
7544 Py_UNICODE fill;
7545 PyObject *v = NULL;
7546 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007547 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007549 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007550 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551
7552 fmt++;
7553 if (*fmt == '(') {
7554 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007555 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 PyObject *key;
7557 int pcount = 1;
7558
7559 if (dict == NULL) {
7560 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007561 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 goto onError;
7563 }
7564 ++fmt;
7565 --fmtcnt;
7566 keystart = fmt;
7567 /* Skip over balanced parentheses */
7568 while (pcount > 0 && --fmtcnt >= 0) {
7569 if (*fmt == ')')
7570 --pcount;
7571 else if (*fmt == '(')
7572 ++pcount;
7573 fmt++;
7574 }
7575 keylen = fmt - keystart - 1;
7576 if (fmtcnt < 0 || pcount > 0) {
7577 PyErr_SetString(PyExc_ValueError,
7578 "incomplete format key");
7579 goto onError;
7580 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007581#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007582 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 then looked up since Python uses strings to hold
7584 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007585 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 key = PyUnicode_EncodeUTF8(keystart,
7587 keylen,
7588 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007589#else
7590 key = PyUnicode_FromUnicode(keystart, keylen);
7591#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 if (key == NULL)
7593 goto onError;
7594 if (args_owned) {
7595 Py_DECREF(args);
7596 args_owned = 0;
7597 }
7598 args = PyObject_GetItem(dict, key);
7599 Py_DECREF(key);
7600 if (args == NULL) {
7601 goto onError;
7602 }
7603 args_owned = 1;
7604 arglen = -1;
7605 argidx = -2;
7606 }
7607 while (--fmtcnt >= 0) {
7608 switch (c = *fmt++) {
7609 case '-': flags |= F_LJUST; continue;
7610 case '+': flags |= F_SIGN; continue;
7611 case ' ': flags |= F_BLANK; continue;
7612 case '#': flags |= F_ALT; continue;
7613 case '0': flags |= F_ZERO; continue;
7614 }
7615 break;
7616 }
7617 if (c == '*') {
7618 v = getnextarg(args, arglen, &argidx);
7619 if (v == NULL)
7620 goto onError;
7621 if (!PyInt_Check(v)) {
7622 PyErr_SetString(PyExc_TypeError,
7623 "* wants int");
7624 goto onError;
7625 }
7626 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007627 if (width == -1 && PyErr_Occurred())
7628 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 if (width < 0) {
7630 flags |= F_LJUST;
7631 width = -width;
7632 }
7633 if (--fmtcnt >= 0)
7634 c = *fmt++;
7635 }
7636 else if (c >= '0' && c <= '9') {
7637 width = c - '0';
7638 while (--fmtcnt >= 0) {
7639 c = *fmt++;
7640 if (c < '0' || c > '9')
7641 break;
7642 if ((width*10) / 10 != width) {
7643 PyErr_SetString(PyExc_ValueError,
7644 "width too big");
7645 goto onError;
7646 }
7647 width = width*10 + (c - '0');
7648 }
7649 }
7650 if (c == '.') {
7651 prec = 0;
7652 if (--fmtcnt >= 0)
7653 c = *fmt++;
7654 if (c == '*') {
7655 v = getnextarg(args, arglen, &argidx);
7656 if (v == NULL)
7657 goto onError;
7658 if (!PyInt_Check(v)) {
7659 PyErr_SetString(PyExc_TypeError,
7660 "* wants int");
7661 goto onError;
7662 }
7663 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00007664 if (prec == -1 && PyErr_Occurred())
7665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 if (prec < 0)
7667 prec = 0;
7668 if (--fmtcnt >= 0)
7669 c = *fmt++;
7670 }
7671 else if (c >= '0' && c <= '9') {
7672 prec = c - '0';
7673 while (--fmtcnt >= 0) {
7674 c = Py_CHARMASK(*fmt++);
7675 if (c < '0' || c > '9')
7676 break;
7677 if ((prec*10) / 10 != prec) {
7678 PyErr_SetString(PyExc_ValueError,
7679 "prec too big");
7680 goto onError;
7681 }
7682 prec = prec*10 + (c - '0');
7683 }
7684 }
7685 } /* prec */
7686 if (fmtcnt >= 0) {
7687 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 if (--fmtcnt >= 0)
7689 c = *fmt++;
7690 }
7691 }
7692 if (fmtcnt < 0) {
7693 PyErr_SetString(PyExc_ValueError,
7694 "incomplete format");
7695 goto onError;
7696 }
7697 if (c != '%') {
7698 v = getnextarg(args, arglen, &argidx);
7699 if (v == NULL)
7700 goto onError;
7701 }
7702 sign = 0;
7703 fill = ' ';
7704 switch (c) {
7705
7706 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007707 pbuf = formatbuf;
7708 /* presume that buffer length is at least 1 */
7709 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 len = 1;
7711 break;
7712
7713 case 's':
7714 case 'r':
7715 if (PyUnicode_Check(v) && c == 's') {
7716 temp = v;
7717 Py_INCREF(temp);
7718 }
7719 else {
7720 PyObject *unicode;
7721 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007722 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 else
7724 temp = PyObject_Repr(v);
7725 if (temp == NULL)
7726 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007727 if (PyUnicode_Check(temp))
7728 /* nothing to do */;
7729 else if (PyString_Check(temp)) {
7730 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007731 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007733 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007735 Py_DECREF(temp);
7736 temp = unicode;
7737 if (temp == NULL)
7738 goto onError;
7739 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007740 else {
7741 Py_DECREF(temp);
7742 PyErr_SetString(PyExc_TypeError,
7743 "%s argument has non-string str()");
7744 goto onError;
7745 }
7746 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007747 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 len = PyUnicode_GET_SIZE(temp);
7749 if (prec >= 0 && len > prec)
7750 len = prec;
7751 break;
7752
7753 case 'i':
7754 case 'd':
7755 case 'u':
7756 case 'o':
7757 case 'x':
7758 case 'X':
7759 if (c == 'i')
7760 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007761 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007762 temp = formatlong(v, flags, prec, c);
7763 if (!temp)
7764 goto onError;
7765 pbuf = PyUnicode_AS_UNICODE(temp);
7766 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007767 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007769 else {
7770 pbuf = formatbuf;
7771 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7772 flags, prec, c, v);
7773 if (len < 0)
7774 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007775 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007776 }
7777 if (flags & F_ZERO)
7778 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 break;
7780
7781 case 'e':
7782 case 'E':
7783 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007784 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 case 'g':
7786 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007787 if (c == 'F')
7788 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007789 pbuf = formatbuf;
7790 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7791 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 if (len < 0)
7793 goto onError;
7794 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007795 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 fill = '0';
7797 break;
7798
7799 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007800 pbuf = formatbuf;
7801 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 if (len < 0)
7803 goto onError;
7804 break;
7805
7806 default:
7807 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007808 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00007809 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007810 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007811 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007812 (Py_ssize_t)(fmt - 1 -
7813 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 goto onError;
7815 }
7816 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007817 if (*pbuf == '-' || *pbuf == '+') {
7818 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 len--;
7820 }
7821 else if (flags & F_SIGN)
7822 sign = '+';
7823 else if (flags & F_BLANK)
7824 sign = ' ';
7825 else
7826 sign = 0;
7827 }
7828 if (width < len)
7829 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007830 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 reslen -= rescnt;
7832 rescnt = width + fmtcnt + 100;
7833 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007834 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007835 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007836 PyErr_NoMemory();
7837 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007838 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007839 if (_PyUnicode_Resize(&result, reslen) < 0) {
7840 Py_XDECREF(temp);
7841 goto onError;
7842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 res = PyUnicode_AS_UNICODE(result)
7844 + reslen - rescnt;
7845 }
7846 if (sign) {
7847 if (fill != ' ')
7848 *res++ = sign;
7849 rescnt--;
7850 if (width > len)
7851 width--;
7852 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007853 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7854 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007855 assert(pbuf[1] == c);
7856 if (fill != ' ') {
7857 *res++ = *pbuf++;
7858 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007859 }
Tim Petersfff53252001-04-12 18:38:48 +00007860 rescnt -= 2;
7861 width -= 2;
7862 if (width < 0)
7863 width = 0;
7864 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 if (width > len && !(flags & F_LJUST)) {
7867 do {
7868 --rescnt;
7869 *res++ = fill;
7870 } while (--width > len);
7871 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007872 if (fill == ' ') {
7873 if (sign)
7874 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007875 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007876 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007877 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007878 *res++ = *pbuf++;
7879 *res++ = *pbuf++;
7880 }
7881 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007882 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883 res += len;
7884 rescnt -= len;
7885 while (--width >= len) {
7886 --rescnt;
7887 *res++ = ' ';
7888 }
7889 if (dict && (argidx < arglen) && c != '%') {
7890 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007891 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007892 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 goto onError;
7894 }
7895 Py_XDECREF(temp);
7896 } /* '%' */
7897 } /* until end */
7898 if (argidx < arglen && !dict) {
7899 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007900 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 goto onError;
7902 }
7903
Thomas Woutersa96affe2006-03-12 00:29:36 +00007904 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7905 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906 if (args_owned) {
7907 Py_DECREF(args);
7908 }
7909 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910 return (PyObject *)result;
7911
7912 onError:
7913 Py_XDECREF(result);
7914 Py_DECREF(uformat);
7915 if (args_owned) {
7916 Py_DECREF(args);
7917 }
7918 return NULL;
7919}
7920
7921static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007922 (readbufferproc) unicode_buffer_getreadbuf,
7923 (writebufferproc) unicode_buffer_getwritebuf,
7924 (segcountproc) unicode_buffer_getsegcount,
7925 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926};
7927
Jeremy Hylton938ace62002-07-17 16:30:39 +00007928static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007929unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7930
Tim Peters6d6c1a32001-08-02 04:15:00 +00007931static PyObject *
7932unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7933{
7934 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007935 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007936 char *encoding = NULL;
7937 char *errors = NULL;
7938
Guido van Rossume023fe02001-08-30 03:12:59 +00007939 if (type != &PyUnicode_Type)
7940 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007941 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7942 kwlist, &x, &encoding, &errors))
7943 return NULL;
7944 if (x == NULL)
7945 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007946 if (encoding == NULL && errors == NULL)
7947 return PyObject_Unicode(x);
7948 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007949 return PyUnicode_FromEncodedObject(x, encoding, errors);
7950}
7951
Guido van Rossume023fe02001-08-30 03:12:59 +00007952static PyObject *
7953unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7954{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007955 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007956 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007957
7958 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7959 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7960 if (tmp == NULL)
7961 return NULL;
7962 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007963 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007964 if (pnew == NULL) {
7965 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007966 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007967 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007968 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7969 if (pnew->str == NULL) {
7970 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007971 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007972 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007973 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007974 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007975 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7976 pnew->length = n;
7977 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007978 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007979 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007980}
7981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007982PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007983"unicode(string [, encoding[, errors]]) -> object\n\
7984\n\
7985Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007986encoding defaults to the current default string encoding.\n\
7987errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007988
Guido van Rossum50e9fb92006-08-17 05:42:55 +00007989static PyObject *unicode_iter(PyObject *seq);
7990
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991PyTypeObject PyUnicode_Type = {
7992 PyObject_HEAD_INIT(&PyType_Type)
7993 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00007994 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 sizeof(PyUnicodeObject), /* tp_size */
7996 0, /* tp_itemsize */
7997 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007998 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008000 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008002 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008003 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008004 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008006 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 (hashfunc) unicode_hash, /* tp_hash*/
8008 0, /* tp_call*/
8009 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008010 PyObject_GenericGetAttr, /* tp_getattro */
8011 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008013 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8014 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008015 unicode_doc, /* tp_doc */
8016 0, /* tp_traverse */
8017 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008018 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008019 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008020 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008021 0, /* tp_iternext */
8022 unicode_methods, /* tp_methods */
8023 0, /* tp_members */
8024 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008025 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008026 0, /* tp_dict */
8027 0, /* tp_descr_get */
8028 0, /* tp_descr_set */
8029 0, /* tp_dictoffset */
8030 0, /* tp_init */
8031 0, /* tp_alloc */
8032 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008033 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034};
8035
8036/* Initialize the Unicode implementation */
8037
Thomas Wouters78890102000-07-22 19:25:51 +00008038void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008040 int i;
8041
Thomas Wouters477c8d52006-05-27 19:21:47 +00008042 /* XXX - move this array to unicodectype.c ? */
8043 Py_UNICODE linebreak[] = {
8044 0x000A, /* LINE FEED */
8045 0x000D, /* CARRIAGE RETURN */
8046 0x001C, /* FILE SEPARATOR */
8047 0x001D, /* GROUP SEPARATOR */
8048 0x001E, /* RECORD SEPARATOR */
8049 0x0085, /* NEXT LINE */
8050 0x2028, /* LINE SEPARATOR */
8051 0x2029, /* PARAGRAPH SEPARATOR */
8052 };
8053
Fred Drakee4315f52000-05-09 19:53:39 +00008054 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008055 unicode_freelist = NULL;
8056 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008058 if (!unicode_empty)
8059 return;
8060
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008061 for (i = 0; i < 256; i++)
8062 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008063 if (PyType_Ready(&PyUnicode_Type) < 0)
8064 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008065
8066 /* initialize the linebreak bloom filter */
8067 bloom_linebreak = make_bloom_mask(
8068 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8069 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008070
8071 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072}
8073
8074/* Finalize the Unicode implementation */
8075
8076void
Thomas Wouters78890102000-07-22 19:25:51 +00008077_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008079 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008080 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008082 Py_XDECREF(unicode_empty);
8083 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008084
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008085 for (i = 0; i < 256; i++) {
8086 if (unicode_latin1[i]) {
8087 Py_DECREF(unicode_latin1[i]);
8088 unicode_latin1[i] = NULL;
8089 }
8090 }
8091
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008092 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093 PyUnicodeObject *v = u;
8094 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008095 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008096 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008097 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008098 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008100 unicode_freelist = NULL;
8101 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008103
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008104
8105
8106/********************* Unicode Iterator **************************/
8107
8108typedef struct {
8109 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008110 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008111 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8112} unicodeiterobject;
8113
8114static void
8115unicodeiter_dealloc(unicodeiterobject *it)
8116{
8117 _PyObject_GC_UNTRACK(it);
8118 Py_XDECREF(it->it_seq);
8119 PyObject_GC_Del(it);
8120}
8121
8122static int
8123unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8124{
8125 Py_VISIT(it->it_seq);
8126 return 0;
8127}
8128
8129static PyObject *
8130unicodeiter_next(unicodeiterobject *it)
8131{
8132 PyUnicodeObject *seq;
8133 PyObject *item;
8134
8135 assert(it != NULL);
8136 seq = it->it_seq;
8137 if (seq == NULL)
8138 return NULL;
8139 assert(PyUnicode_Check(seq));
8140
8141 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008142 item = PyUnicode_FromUnicode(
8143 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008144 if (item != NULL)
8145 ++it->it_index;
8146 return item;
8147 }
8148
8149 Py_DECREF(seq);
8150 it->it_seq = NULL;
8151 return NULL;
8152}
8153
8154static PyObject *
8155unicodeiter_len(unicodeiterobject *it)
8156{
8157 Py_ssize_t len = 0;
8158 if (it->it_seq)
8159 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8160 return PyInt_FromSsize_t(len);
8161}
8162
8163PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8164
8165static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008166 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8167 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008168 {NULL, NULL} /* sentinel */
8169};
8170
8171PyTypeObject PyUnicodeIter_Type = {
8172 PyObject_HEAD_INIT(&PyType_Type)
8173 0, /* ob_size */
8174 "unicodeiterator", /* tp_name */
8175 sizeof(unicodeiterobject), /* tp_basicsize */
8176 0, /* tp_itemsize */
8177 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008178 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008179 0, /* tp_print */
8180 0, /* tp_getattr */
8181 0, /* tp_setattr */
8182 0, /* tp_compare */
8183 0, /* tp_repr */
8184 0, /* tp_as_number */
8185 0, /* tp_as_sequence */
8186 0, /* tp_as_mapping */
8187 0, /* tp_hash */
8188 0, /* tp_call */
8189 0, /* tp_str */
8190 PyObject_GenericGetAttr, /* tp_getattro */
8191 0, /* tp_setattro */
8192 0, /* tp_as_buffer */
8193 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8194 0, /* tp_doc */
8195 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8196 0, /* tp_clear */
8197 0, /* tp_richcompare */
8198 0, /* tp_weaklistoffset */
8199 PyObject_SelfIter, /* tp_iter */
8200 (iternextfunc)unicodeiter_next, /* tp_iternext */
8201 unicodeiter_methods, /* tp_methods */
8202 0,
8203};
8204
8205static PyObject *
8206unicode_iter(PyObject *seq)
8207{
8208 unicodeiterobject *it;
8209
8210 if (!PyUnicode_Check(seq)) {
8211 PyErr_BadInternalCall();
8212 return NULL;
8213 }
8214 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8215 if (it == NULL)
8216 return NULL;
8217 it->it_index = 0;
8218 Py_INCREF(seq);
8219 it->it_seq = (PyUnicodeObject *)seq;
8220 _PyObject_GC_TRACK(it);
8221 return (PyObject *)it;
8222}
8223
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008224#ifdef __cplusplus
8225}
8226#endif
8227
8228
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008229/*
8230Local variables:
8231c-basic-offset: 4
8232indent-tabs-mode: nil
8233End:
8234*/