blob: 1f6f738c6df8201225d95026ecef7964d126c2cc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
129/* stuff to implement simple "bloom filters" for Unicode characters.
130 to keep things simple, we use a single bitmask, using the least 5
131 bits from each unicode characters as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type holding a bloom mask; one bit per value of the low
   5 bits of a character. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in
   mask.  The shifted constant is unsigned long (1UL): the bit index can
   be 31, and shifting a signed int 1 into its sign bit is undefined
   behaviour.  mask is parenthesized so that any expression may be
   passed for it. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True when ch is a line break: the cheap mask test runs first and
   the exact Py_UNICODE_ISLINEBREAK() test only confirms a hit. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* True when chr is a member of set[0..setlen-1].  The bloom mask test
   filters out most non-members before the linear scan runs.  The whole
   expansion is parenthesized so the macro behaves as a single operand
   next to operators such as ! or ==. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Martin v. Löwis68192102007-07-21 06:55:02 +0000315 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis68192102007-07-21 06:55:02 +0000329 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* File-private shorthand for PyUnicode_Resize() that accepts the
   address of any object pointer variable and casts it to PyObject **.
   For use in unicodeobject.c only ! */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +0000557 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000607 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000717 Py_Type(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
799 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000800 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000802 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000803
804 PyObject *restuple = NULL;
805 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
807 Py_ssize_t requiredsize;
808 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811 int res = -1;
812
813 if (*errorHandler == NULL) {
814 *errorHandler = PyCodec_LookupError(errors);
815 if (*errorHandler == NULL)
816 goto onError;
817 }
818
819 if (*exceptionObject == NULL) {
820 *exceptionObject = PyUnicodeDecodeError_Create(
821 encoding, input, insize, *startinpos, *endinpos, reason);
822 if (*exceptionObject == NULL)
823 goto onError;
824 }
825 else {
826 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
827 goto onError;
828 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
829 goto onError;
830 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
831 goto onError;
832 }
833
834 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
835 if (restuple == NULL)
836 goto onError;
837 if (!PyTuple_Check(restuple)) {
838 PyErr_Format(PyExc_TypeError, &argparse[4]);
839 goto onError;
840 }
841 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
842 goto onError;
843 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000844 newpos = insize+newpos;
845 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000846 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000847 goto onError;
848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849
850 /* need more space? (at least enough for what we
851 have+the replacement+the rest of the string (starting
852 at the new input position), so we won't have to check space
853 when there are no errors in the rest of the string) */
854 repptr = PyUnicode_AS_UNICODE(repunicode);
855 repsize = PyUnicode_GET_SIZE(repunicode);
856 requiredsize = *outpos + repsize + insize-newpos;
857 if (requiredsize > outsize) {
858 if (requiredsize<2*outsize)
859 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000860 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861 goto onError;
862 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
863 }
864 *endinpos = newpos;
865 *inptr = input + newpos;
866 Py_UNICODE_COPY(*outptr, repptr, repsize);
867 *outptr += repsize;
868 *outpos += repsize;
869 /* we made it! */
870 res = 0;
871
872 onError:
873 Py_XDECREF(restuple);
874 return res;
875}
876
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000877/* --- UTF-7 Codec -------------------------------------------------------- */
878
879/* see RFC2152 for details */
880
/* Indicates, for each of the 128 ASCII code points, whether a UTF-7
   character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
899
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c cannot be written directly and has to go into a base64 shift
   sequence.  encodeO / encodeWS select whether the optional classes 3
   (Set O) and 2 (whitespace) of utf7_special are treated as special. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  Explicit ASCII range checks are used
   instead of isalnum(): callers pass Py_UNICODE values, which may lie
   outside the range isalnum() accepts, and isalnum() is locale dependent. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit back to its 6-bit value; c must satisfy B64CHAR. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out while at least 6 of the `bits` pending bits
   in ch remain.  Expands to a bare while loop: use it as a statement. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units to out while at least 16 of the `bits` pending bits
   in ch remain.  Expands to a bare while loop, and expects a variable
   `errmsg` and a label `utf7Error` in the enclosing function.
   NOTE(review): the range tested below is 0xDC00..0xDFFF although the
   comment speaks of a surrogate pair -- confirm this is intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000943PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000944 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 const char *errors)
946{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000947 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000948 Py_ssize_t startinpos;
949 Py_ssize_t endinpos;
950 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951 const char *e;
952 PyUnicodeObject *unicode;
953 Py_UNICODE *p;
954 const char *errmsg = "";
955 int inShift = 0;
956 unsigned int bitsleft = 0;
957 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000958 int surrogate = 0;
959 PyObject *errorHandler = NULL;
960 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000961
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
967
968 p = unicode->str;
969 e = s + size;
970
971 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_UNICODE ch;
973 restart:
974 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000975
976 if (inShift) {
977 if ((ch == '-') || !B64CHAR(ch)) {
978 inShift = 0;
979 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000980
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
982 if (bitsleft >= 6) {
983 /* The shift sequence has a partial character in it. If
984 bitsleft < 6 then we could just classify it as padding
985 but that is not the case here */
986
987 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000988 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000989 }
990 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000991 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000992 here so indicate the potential of a misencoded character. */
993
994 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
995 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
996 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000997 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000998 }
999
1000 if (ch == '-') {
1001 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001002 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003 inShift = 1;
1004 }
1005 } else if (SPECIAL(ch,0,0)) {
1006 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001007 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 } else {
1009 *p++ = ch;
1010 }
1011 } else {
1012 charsleft = (charsleft << 6) | UB64(ch);
1013 bitsleft += 6;
1014 s++;
1015 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1016 }
1017 }
1018 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001019 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 s++;
1021 if (s < e && *s == '-') {
1022 s++;
1023 *p++ = '+';
1024 } else
1025 {
1026 inShift = 1;
1027 bitsleft = 0;
1028 }
1029 }
1030 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001031 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001032 errmsg = "unexpected special character";
1033 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001034 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035 }
1036 else {
1037 *p++ = ch;
1038 s++;
1039 }
1040 continue;
1041 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001042 outpos = p-PyUnicode_AS_UNICODE(unicode);
1043 endinpos = s-starts;
1044 if (unicode_decode_call_errorhandler(
1045 errors, &errorHandler,
1046 "utf7", errmsg,
1047 starts, size, &startinpos, &endinpos, &exc, &s,
1048 (PyObject **)&unicode, &outpos, &p))
1049 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001050 }
1051
1052 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001053 outpos = p-PyUnicode_AS_UNICODE(unicode);
1054 endinpos = size;
1055 if (unicode_decode_call_errorhandler(
1056 errors, &errorHandler,
1057 "utf7", "unterminated shift sequence",
1058 starts, size, &startinpos, &endinpos, &exc, &s,
1059 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001060 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001061 if (s < e)
1062 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 }
1064
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001065 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 goto onError;
1067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001068 Py_XDECREF(errorHandler);
1069 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 return (PyObject *)unicode;
1071
1072onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001073 Py_XDECREF(errorHandler);
1074 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001075 Py_DECREF(unicode);
1076 return NULL;
1077}
1078
1079
1080PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001082 int encodeSetO,
1083 int encodeWhiteSpace,
1084 const char *errors)
1085{
1086 PyObject *v;
1087 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001091 unsigned int bitsleft = 0;
1092 unsigned long charsleft = 0;
1093 char * out;
1094 char * start;
1095
1096 if (size == 0)
1097 return PyString_FromStringAndSize(NULL, 0);
1098
1099 v = PyString_FromStringAndSize(NULL, cbAllocated);
1100 if (v == NULL)
1101 return NULL;
1102
1103 start = out = PyString_AS_STRING(v);
1104 for (;i < size; ++i) {
1105 Py_UNICODE ch = s[i];
1106
1107 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001108 if (ch == '+') {
1109 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001110 *out++ = '-';
1111 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1112 charsleft = ch;
1113 bitsleft = 16;
1114 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001116 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001117 } else {
1118 *out++ = (char) ch;
1119 }
1120 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001121 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1122 *out++ = B64(charsleft << (6-bitsleft));
1123 charsleft = 0;
1124 bitsleft = 0;
1125 /* Characters not in the BASE64 set implicitly unshift the sequence
1126 so no '-' is required, except if the character is itself a '-' */
1127 if (B64CHAR(ch) || ch == '-') {
1128 *out++ = '-';
1129 }
1130 inShift = 0;
1131 *out++ = (char) ch;
1132 } else {
1133 bitsleft += 16;
1134 charsleft = (charsleft << 16) | ch;
1135 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1136
1137 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001138 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001139 or '-' then the shift sequence will be terminated implicitly and we
1140 don't have to insert a '-'. */
1141
1142 if (bitsleft == 0) {
1143 if (i + 1 < size) {
1144 Py_UNICODE ch2 = s[i+1];
1145
1146 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001147
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148 } else if (B64CHAR(ch2) || ch2 == '-') {
1149 *out++ = '-';
1150 inShift = 0;
1151 } else {
1152 inShift = 0;
1153 }
1154
1155 }
1156 else {
1157 *out++ = '-';
1158 inShift = 0;
1159 }
1160 }
Tim Petersced69f82003-09-16 20:30:58 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001163 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001164 if (bitsleft) {
1165 *out++= B64(charsleft << (6-bitsleft) );
1166 *out++ = '-';
1167 }
1168
Tim Peters5de98422002-04-27 18:44:32 +00001169 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001170 return v;
1171}
1172
1173#undef SPECIAL
1174#undef B64
1175#undef B64CHAR
1176#undef UB64
1177#undef ENCODE
1178#undef DECODE
1179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180/* --- UTF-8 Codec -------------------------------------------------------- */
1181
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1203
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001205 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 const char *errors)
1207{
Walter Dörwald69652032004-09-07 20:24:22 +00001208 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1209}
1210
1211PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001213 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001214 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001216 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001218 Py_ssize_t startinpos;
1219 Py_ssize_t endinpos;
1220 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 const char *e;
1222 PyUnicodeObject *unicode;
1223 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 PyObject *errorHandler = NULL;
1226 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227
1228 /* Note: size will always be longer than the resulting Unicode
1229 character count */
1230 unicode = _PyUnicode_New(size);
1231 if (!unicode)
1232 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001233 if (size == 0) {
1234 if (consumed)
1235 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238
1239 /* Unpack UTF-8 encoded data */
1240 p = unicode->str;
1241 e = s + size;
1242
1243 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001244 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245
1246 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001247 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 s++;
1249 continue;
1250 }
1251
1252 n = utf8_code_length[ch];
1253
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001254 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001255 if (consumed)
1256 break;
1257 else {
1258 errmsg = "unexpected end of data";
1259 startinpos = s-starts;
1260 endinpos = size;
1261 goto utf8Error;
1262 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264
1265 switch (n) {
1266
1267 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001268 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 startinpos = s-starts;
1270 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272
1273 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275 startinpos = s-starts;
1276 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278
1279 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 if ((s[1] & 0xc0) != 0x80) {
1281 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 startinpos = s-starts;
1283 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001284 goto utf8Error;
1285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 startinpos = s-starts;
1289 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 errmsg = "illegal encoding";
1291 goto utf8Error;
1292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001294 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 break;
1296
1297 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001298 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001299 (s[2] & 0xc0) != 0x80) {
1300 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 startinpos = s-starts;
1302 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001303 goto utf8Error;
1304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001306 if (ch < 0x0800) {
1307 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001308 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001309
1310 XXX For wide builds (UCS-4) we should probably try
1311 to recombine the surrogates into a single code
1312 unit.
1313 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001315 startinpos = s-starts;
1316 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 goto utf8Error;
1318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001320 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001321 break;
1322
1323 case 4:
1324 if ((s[1] & 0xc0) != 0x80 ||
1325 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001326 (s[3] & 0xc0) != 0x80) {
1327 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001328 startinpos = s-starts;
1329 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 goto utf8Error;
1331 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001332 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1333 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1334 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001338 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001339 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001340 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341 startinpos = s-starts;
1342 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 goto utf8Error;
1344 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001345#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001346 *p++ = (Py_UNICODE)ch;
1347#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001349
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001350 /* translate from 10000..10FFFF to 0..FFFF */
1351 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001353 /* high surrogate = top 10 bits added to D800 */
1354 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001356 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001357 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001358#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 break;
1360
1361 default:
1362 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001363 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001364 startinpos = s-starts;
1365 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001366 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 }
1368 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001370
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001371 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001372 outpos = p-PyUnicode_AS_UNICODE(unicode);
1373 if (unicode_decode_call_errorhandler(
1374 errors, &errorHandler,
1375 "utf8", errmsg,
1376 starts, size, &startinpos, &endinpos, &exc, &s,
1377 (PyObject **)&unicode, &outpos, &p))
1378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 }
Walter Dörwald69652032004-09-07 20:24:22 +00001380 if (consumed)
1381 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382
1383 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001384 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 goto onError;
1386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 Py_XDECREF(errorHandler);
1388 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 return (PyObject *)unicode;
1390
1391onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001392 Py_XDECREF(errorHandler);
1393 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 Py_DECREF(unicode);
1395 return NULL;
1396}
1397
Tim Peters602f7402002-04-27 18:03:26 +00001398/* Allocation strategy: if the string is short, convert into a stack buffer
1399 and allocate exactly as much space needed at the end. Else allocate the
1400 maximum possible needed (4 result bytes per Unicode character), and return
1401 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001402*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001403PyObject *
1404PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001405 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001406 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407{
Tim Peters602f7402002-04-27 18:03:26 +00001408#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001409
Martin v. Löwis18e16552006-02-15 17:27:45 +00001410 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001411 PyObject *v; /* result string object */
1412 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001414 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001415 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001416
Tim Peters602f7402002-04-27 18:03:26 +00001417 assert(s != NULL);
1418 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419
Tim Peters602f7402002-04-27 18:03:26 +00001420 if (size <= MAX_SHORT_UNICHARS) {
1421 /* Write into the stack buffer; nallocated can't overflow.
1422 * At the end, we'll allocate exactly as much heap space as it
1423 * turns out we need.
1424 */
1425 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1426 v = NULL; /* will allocate after we're done */
1427 p = stackbuf;
1428 }
1429 else {
1430 /* Overallocate on the heap, and give the excess back at the end. */
1431 nallocated = size * 4;
1432 if (nallocated / 4 != size) /* overflow! */
1433 return PyErr_NoMemory();
1434 v = PyString_FromStringAndSize(NULL, nallocated);
1435 if (v == NULL)
1436 return NULL;
1437 p = PyString_AS_STRING(v);
1438 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001439
Tim Peters602f7402002-04-27 18:03:26 +00001440 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001441 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001442
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001443 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001444 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001446
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001448 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001449 *p++ = (char)(0xc0 | (ch >> 6));
1450 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001451 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001452 else {
Tim Peters602f7402002-04-27 18:03:26 +00001453 /* Encode UCS2 Unicode ordinals */
1454 if (ch < 0x10000) {
1455 /* Special case: check for high surrogate */
1456 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1457 Py_UCS4 ch2 = s[i];
1458 /* Check for low surrogate and combine the two to
1459 form a UCS4 value */
1460 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001461 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001462 i++;
1463 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Tim Peters602f7402002-04-27 18:03:26 +00001465 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001466 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001467 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001468 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1469 *p++ = (char)(0x80 | (ch & 0x3f));
1470 continue;
1471 }
1472encodeUCS4:
1473 /* Encode UCS4 Unicode ordinals */
1474 *p++ = (char)(0xf0 | (ch >> 18));
1475 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1476 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1477 *p++ = (char)(0x80 | (ch & 0x3f));
1478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001480
Tim Peters602f7402002-04-27 18:03:26 +00001481 if (v == NULL) {
1482 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001483 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001484 assert(nneeded <= nallocated);
1485 v = PyString_FromStringAndSize(stackbuf, nneeded);
1486 }
1487 else {
1488 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001489 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001490 assert(nneeded <= nallocated);
1491 _PyString_Resize(&v, nneeded);
1492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001494
Tim Peters602f7402002-04-27 18:03:26 +00001495#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496}
1497
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1499{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 if (!PyUnicode_Check(unicode)) {
1501 PyErr_BadArgument();
1502 return NULL;
1503 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001504 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1505 PyUnicode_GET_SIZE(unicode),
1506 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
Walter Dörwald6e390802007-08-17 16:41:28 +00001509/* --- UTF-32 Codec ------------------------------------------------------- */
1510
1511PyObject *
1512PyUnicode_DecodeUTF32(const char *s,
1513 Py_ssize_t size,
1514 const char *errors,
1515 int *byteorder)
1516{
1517 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1518}
1519
1520PyObject *
1521PyUnicode_DecodeUTF32Stateful(const char *s,
1522 Py_ssize_t size,
1523 const char *errors,
1524 int *byteorder,
1525 Py_ssize_t *consumed)
1526{
1527 const char *starts = s;
1528 Py_ssize_t startinpos;
1529 Py_ssize_t endinpos;
1530 Py_ssize_t outpos;
1531 PyUnicodeObject *unicode;
1532 Py_UNICODE *p;
1533#ifndef Py_UNICODE_WIDE
1534 int i, pairs;
1535#else
1536 const int pairs = 0;
1537#endif
1538 const unsigned char *q, *e;
1539 int bo = 0; /* assume native ordering by default */
1540 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00001541 /* Offsets from q for retrieving bytes in the right order. */
1542#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1543 int iorder[] = {0, 1, 2, 3};
1544#else
1545 int iorder[] = {3, 2, 1, 0};
1546#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00001547 PyObject *errorHandler = NULL;
1548 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00001549 /* On narrow builds we split characters outside the BMP into two
1550 codepoints => count how much extra space we need. */
1551#ifndef Py_UNICODE_WIDE
1552 for (i = pairs = 0; i < size/4; i++)
1553 if (((Py_UCS4 *)s)[i] >= 0x10000)
1554 pairs++;
1555#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00001556
1557 /* This might be one to much, because of a BOM */
1558 unicode = _PyUnicode_New((size+3)/4+pairs);
1559 if (!unicode)
1560 return NULL;
1561 if (size == 0)
1562 return (PyObject *)unicode;
1563
1564 /* Unpack UTF-32 encoded data */
1565 p = unicode->str;
1566 q = (unsigned char *)s;
1567 e = q + size;
1568
1569 if (byteorder)
1570 bo = *byteorder;
1571
1572 /* Check for BOM marks (U+FEFF) in the input and adjust current
1573 byte order setting accordingly. In native mode, the leading BOM
1574 mark is skipped, in all other modes, it is copied to the output
1575 stream as-is (giving a ZWNBSP character). */
1576 if (bo == 0) {
1577 if (size >= 4) {
1578 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1579 (q[iorder[1]] << 8) | q[iorder[0]];
1580#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1581 if (bom == 0x0000FEFF) {
1582 q += 4;
1583 bo = -1;
1584 }
1585 else if (bom == 0xFFFE0000) {
1586 q += 4;
1587 bo = 1;
1588 }
1589#else
1590 if (bom == 0x0000FEFF) {
1591 q += 4;
1592 bo = 1;
1593 }
1594 else if (bom == 0xFFFE0000) {
1595 q += 4;
1596 bo = -1;
1597 }
1598#endif
1599 }
1600 }
1601
1602 if (bo == -1) {
1603 /* force LE */
1604 iorder[0] = 0;
1605 iorder[1] = 1;
1606 iorder[2] = 2;
1607 iorder[3] = 3;
1608 }
1609 else if (bo == 1) {
1610 /* force BE */
1611 iorder[0] = 3;
1612 iorder[1] = 2;
1613 iorder[2] = 1;
1614 iorder[3] = 0;
1615 }
1616
1617 while (q < e) {
1618 Py_UCS4 ch;
1619 /* remaining bytes at the end? (size should be divisible by 4) */
1620 if (e-q<4) {
1621 if (consumed)
1622 break;
1623 errmsg = "truncated data";
1624 startinpos = ((const char *)q)-starts;
1625 endinpos = ((const char *)e)-starts;
1626 goto utf32Error;
1627 /* The remaining input chars are ignored if the callback
1628 chooses to skip the input */
1629 }
1630 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1631 (q[iorder[1]] << 8) | q[iorder[0]];
1632
1633 if (ch >= 0x110000)
1634 {
1635 errmsg = "codepoint not in range(0x110000)";
1636 startinpos = ((const char *)q)-starts;
1637 endinpos = startinpos+4;
1638 goto utf32Error;
1639 }
1640#ifndef Py_UNICODE_WIDE
1641 if (ch >= 0x10000)
1642 {
1643 *p++ = 0xD800 | ((ch-0x10000) >> 10);
1644 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
1645 }
1646 else
1647#endif
1648 *p++ = ch;
1649 q += 4;
1650 continue;
1651 utf32Error:
1652 outpos = p-PyUnicode_AS_UNICODE(unicode);
1653 if (unicode_decode_call_errorhandler(
1654 errors, &errorHandler,
1655 "utf32", errmsg,
1656 starts, size, &startinpos, &endinpos, &exc, &s,
1657 (PyObject **)&unicode, &outpos, &p))
1658 goto onError;
1659 }
1660
1661 if (byteorder)
1662 *byteorder = bo;
1663
1664 if (consumed)
1665 *consumed = (const char *)q-starts;
1666
1667 /* Adjust length */
1668 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1669 goto onError;
1670
1671 Py_XDECREF(errorHandler);
1672 Py_XDECREF(exc);
1673 return (PyObject *)unicode;
1674
1675onError:
1676 Py_DECREF(unicode);
1677 Py_XDECREF(errorHandler);
1678 Py_XDECREF(exc);
1679 return NULL;
1680}
1681
1682PyObject *
1683PyUnicode_EncodeUTF32(const Py_UNICODE *s,
1684 Py_ssize_t size,
1685 const char *errors,
1686 int byteorder)
1687{
1688 PyObject *v;
1689 unsigned char *p;
1690#ifndef Py_UNICODE_WIDE
1691 int i, pairs;
1692#else
1693 const int pairs = 0;
1694#endif
1695 /* Offsets from p for storing byte pairs in the right order. */
1696#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1697 int iorder[] = {0, 1, 2, 3};
1698#else
1699 int iorder[] = {3, 2, 1, 0};
1700#endif
1701
1702#define STORECHAR(CH) \
1703 do { \
1704 p[iorder[3]] = ((CH) >> 24) & 0xff; \
1705 p[iorder[2]] = ((CH) >> 16) & 0xff; \
1706 p[iorder[1]] = ((CH) >> 8) & 0xff; \
1707 p[iorder[0]] = (CH) & 0xff; \
1708 p += 4; \
1709 } while(0)
1710
1711 /* In narrow builds we can output surrogate pairs as one codepoint,
1712 so we need less space. */
1713#ifndef Py_UNICODE_WIDE
1714 for (i = pairs = 0; i < size-1; i++)
1715 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
1716 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
1717 pairs++;
1718#endif
1719 v = PyString_FromStringAndSize(NULL,
1720 4 * (size - pairs + (byteorder == 0)));
1721 if (v == NULL)
1722 return NULL;
1723
1724 p = (unsigned char *)PyString_AS_STRING(v);
1725 if (byteorder == 0)
1726 STORECHAR(0xFEFF);
1727 if (size == 0)
1728 return v;
1729
1730 if (byteorder == -1) {
1731 /* force LE */
1732 iorder[0] = 0;
1733 iorder[1] = 1;
1734 iorder[2] = 2;
1735 iorder[3] = 3;
1736 }
1737 else if (byteorder == 1) {
1738 /* force BE */
1739 iorder[0] = 3;
1740 iorder[1] = 2;
1741 iorder[2] = 1;
1742 iorder[3] = 0;
1743 }
1744
1745 while (size-- > 0) {
1746 Py_UCS4 ch = *s++;
1747#ifndef Py_UNICODE_WIDE
1748 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
1749 Py_UCS4 ch2 = *s;
1750 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1751 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1752 s++;
1753 size--;
1754 }
1755 }
1756#endif
1757 STORECHAR(ch);
1758 }
1759 return v;
1760#undef STORECHAR
1761}
1762
1763PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
1764{
1765 if (!PyUnicode_Check(unicode)) {
1766 PyErr_BadArgument();
1767 return NULL;
1768 }
1769 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
1770 PyUnicode_GET_SIZE(unicode),
1771 NULL,
1772 0);
1773}
1774
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775/* --- UTF-16 Codec ------------------------------------------------------- */
1776
Tim Peters772747b2001-08-09 22:21:55 +00001777PyObject *
1778PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001779 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001780 const char *errors,
1781 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald69652032004-09-07 20:24:22 +00001783 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1784}
1785
1786PyObject *
1787PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001789 const char *errors,
1790 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001791 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001793 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001794 Py_ssize_t startinpos;
1795 Py_ssize_t endinpos;
1796 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 PyUnicodeObject *unicode;
1798 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001799 const unsigned char *q, *e;
1800 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001802 /* Offsets from q for retrieving byte pairs in the right order. */
1803#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1804 int ihi = 1, ilo = 0;
1805#else
1806 int ihi = 0, ilo = 1;
1807#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 PyObject *errorHandler = NULL;
1809 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810
1811 /* Note: size will always be longer than the resulting Unicode
1812 character count */
1813 unicode = _PyUnicode_New(size);
1814 if (!unicode)
1815 return NULL;
1816 if (size == 0)
1817 return (PyObject *)unicode;
1818
1819 /* Unpack UTF-16 encoded data */
1820 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001821 q = (unsigned char *)s;
1822 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823
1824 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001825 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001827 /* Check for BOM marks (U+FEFF) in the input and adjust current
1828 byte order setting accordingly. In native mode, the leading BOM
1829 mark is skipped, in all other modes, it is copied to the output
1830 stream as-is (giving a ZWNBSP character). */
1831 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001832 if (size >= 2) {
1833 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001834#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001835 if (bom == 0xFEFF) {
1836 q += 2;
1837 bo = -1;
1838 }
1839 else if (bom == 0xFFFE) {
1840 q += 2;
1841 bo = 1;
1842 }
Tim Petersced69f82003-09-16 20:30:58 +00001843#else
Walter Dörwald69652032004-09-07 20:24:22 +00001844 if (bom == 0xFEFF) {
1845 q += 2;
1846 bo = 1;
1847 }
1848 else if (bom == 0xFFFE) {
1849 q += 2;
1850 bo = -1;
1851 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001852#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001853 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001854 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Tim Peters772747b2001-08-09 22:21:55 +00001856 if (bo == -1) {
1857 /* force LE */
1858 ihi = 1;
1859 ilo = 0;
1860 }
1861 else if (bo == 1) {
1862 /* force BE */
1863 ihi = 0;
1864 ilo = 1;
1865 }
1866
1867 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001869 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001871 if (consumed)
1872 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 errmsg = "truncated data";
1874 startinpos = ((const char *)q)-starts;
1875 endinpos = ((const char *)e)-starts;
1876 goto utf16Error;
1877 /* The remaining input chars are ignored if the callback
1878 chooses to skip the input */
1879 }
1880 ch = (q[ihi] << 8) | q[ilo];
1881
Tim Peters772747b2001-08-09 22:21:55 +00001882 q += 2;
1883
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884 if (ch < 0xD800 || ch > 0xDFFF) {
1885 *p++ = ch;
1886 continue;
1887 }
1888
1889 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001890 if (q >= e) {
1891 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001892 startinpos = (((const char *)q)-2)-starts;
1893 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001894 goto utf16Error;
1895 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001896 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001897 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1898 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001899 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001900#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001901 *p++ = ch;
1902 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001903#else
1904 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001906 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 }
1908 else {
1909 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001910 startinpos = (((const char *)q)-4)-starts;
1911 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 goto utf16Error;
1913 }
1914
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001917 startinpos = (((const char *)q)-2)-starts;
1918 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001919 /* Fall through to report the error */
1920
1921 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 outpos = p-PyUnicode_AS_UNICODE(unicode);
1923 if (unicode_decode_call_errorhandler(
1924 errors, &errorHandler,
1925 "utf16", errmsg,
1926 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1927 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001928 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 }
1930
1931 if (byteorder)
1932 *byteorder = bo;
1933
Walter Dörwald69652032004-09-07 20:24:22 +00001934 if (consumed)
1935 *consumed = (const char *)q-starts;
1936
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001938 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 goto onError;
1940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001941 Py_XDECREF(errorHandler);
1942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 return (PyObject *)unicode;
1944
1945onError:
1946 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 Py_XDECREF(errorHandler);
1948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 return NULL;
1950}
1951
Tim Peters772747b2001-08-09 22:21:55 +00001952PyObject *
1953PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001954 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001955 const char *errors,
1956 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957{
1958 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001959 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001960#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001961 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001962#else
1963 const int pairs = 0;
1964#endif
Tim Peters772747b2001-08-09 22:21:55 +00001965 /* Offsets from p for storing byte pairs in the right order. */
1966#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1967 int ihi = 1, ilo = 0;
1968#else
1969 int ihi = 0, ilo = 1;
1970#endif
1971
1972#define STORECHAR(CH) \
1973 do { \
1974 p[ihi] = ((CH) >> 8) & 0xff; \
1975 p[ilo] = (CH) & 0xff; \
1976 p += 2; \
1977 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001979#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001980 for (i = pairs = 0; i < size; i++)
1981 if (s[i] >= 0x10000)
1982 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001983#endif
Tim Petersced69f82003-09-16 20:30:58 +00001984 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001985 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 if (v == NULL)
1987 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988
Tim Peters772747b2001-08-09 22:21:55 +00001989 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001991 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001992 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001993 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001994
1995 if (byteorder == -1) {
1996 /* force LE */
1997 ihi = 1;
1998 ilo = 0;
1999 }
2000 else if (byteorder == 1) {
2001 /* force BE */
2002 ihi = 0;
2003 ilo = 1;
2004 }
2005
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002006 while (size-- > 0) {
2007 Py_UNICODE ch = *s++;
2008 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002009#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002010 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002011 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2012 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002014#endif
Tim Peters772747b2001-08-09 22:21:55 +00002015 STORECHAR(ch);
2016 if (ch2)
2017 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002020#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021}
2022
2023PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2024{
2025 if (!PyUnicode_Check(unicode)) {
2026 PyErr_BadArgument();
2027 return NULL;
2028 }
2029 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2030 PyUnicode_GET_SIZE(unicode),
2031 NULL,
2032 0);
2033}
2034
2035/* --- Unicode Escape Codec ----------------------------------------------- */
2036
Fredrik Lundh06d12682001-01-24 07:59:11 +00002037static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002038
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002040 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 const char *errors)
2042{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002044 Py_ssize_t startinpos;
2045 Py_ssize_t endinpos;
2046 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002051 char* message;
2052 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 PyObject *errorHandler = NULL;
2054 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002055
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 /* Escaped strings will always be longer than the resulting
2057 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058 length after conversion to the true value.
2059 (but if the error callback returns a long replacement string
2060 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 v = _PyUnicode_New(size);
2062 if (v == NULL)
2063 goto onError;
2064 if (size == 0)
2065 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002069
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 while (s < end) {
2071 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002072 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002073 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074
2075 /* Non-escape characters are interpreted as Unicode ordinals */
2076 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002077 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 continue;
2079 }
2080
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 /* \ - Escapes */
2083 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002084 c = *s++;
2085 if (s > end)
2086 c = '\0'; /* Invalid after \ */
2087 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088
2089 /* \x escapes */
2090 case '\n': break;
2091 case '\\': *p++ = '\\'; break;
2092 case '\'': *p++ = '\''; break;
2093 case '\"': *p++ = '\"'; break;
2094 case 'b': *p++ = '\b'; break;
2095 case 'f': *p++ = '\014'; break; /* FF */
2096 case 't': *p++ = '\t'; break;
2097 case 'n': *p++ = '\n'; break;
2098 case 'r': *p++ = '\r'; break;
2099 case 'v': *p++ = '\013'; break; /* VT */
2100 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2101
2102 /* \OOO (octal) escapes */
2103 case '0': case '1': case '2': case '3':
2104 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002105 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002106 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002107 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002108 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002109 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002111 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 break;
2113
Fredrik Lundhccc74732001-02-18 22:13:49 +00002114 /* hex escapes */
2115 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002117 digits = 2;
2118 message = "truncated \\xXX escape";
2119 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120
Fredrik Lundhccc74732001-02-18 22:13:49 +00002121 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002123 digits = 4;
2124 message = "truncated \\uXXXX escape";
2125 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126
Fredrik Lundhccc74732001-02-18 22:13:49 +00002127 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002128 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002129 digits = 8;
2130 message = "truncated \\UXXXXXXXX escape";
2131 hexescape:
2132 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 outpos = p-PyUnicode_AS_UNICODE(v);
2134 if (s+digits>end) {
2135 endinpos = size;
2136 if (unicode_decode_call_errorhandler(
2137 errors, &errorHandler,
2138 "unicodeescape", "end of string in escape sequence",
2139 starts, size, &startinpos, &endinpos, &exc, &s,
2140 (PyObject **)&v, &outpos, &p))
2141 goto onError;
2142 goto nextByte;
2143 }
2144 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002145 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002146 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002147 endinpos = (s+i+1)-starts;
2148 if (unicode_decode_call_errorhandler(
2149 errors, &errorHandler,
2150 "unicodeescape", message,
2151 starts, size, &startinpos, &endinpos, &exc, &s,
2152 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002153 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002154 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002155 }
2156 chr = (chr<<4) & ~0xF;
2157 if (c >= '0' && c <= '9')
2158 chr += c - '0';
2159 else if (c >= 'a' && c <= 'f')
2160 chr += 10 + c - 'a';
2161 else
2162 chr += 10 + c - 'A';
2163 }
2164 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002165 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 /* _decoding_error will have already written into the
2167 target buffer. */
2168 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002169 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002170 /* when we get here, chr is a 32-bit unicode character */
2171 if (chr <= 0xffff)
2172 /* UCS-2 character */
2173 *p++ = (Py_UNICODE) chr;
2174 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002175 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002176 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002177#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002178 *p++ = chr;
2179#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002180 chr -= 0x10000L;
2181 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002182 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002183#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002184 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002185 endinpos = s-starts;
2186 outpos = p-PyUnicode_AS_UNICODE(v);
2187 if (unicode_decode_call_errorhandler(
2188 errors, &errorHandler,
2189 "unicodeescape", "illegal Unicode character",
2190 starts, size, &startinpos, &endinpos, &exc, &s,
2191 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002192 goto onError;
2193 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002194 break;
2195
2196 /* \N{name} */
2197 case 'N':
2198 message = "malformed \\N character escape";
2199 if (ucnhash_CAPI == NULL) {
2200 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002201 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002202 m = PyImport_ImportModule("unicodedata");
2203 if (m == NULL)
2204 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002205 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002206 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002207 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002208 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002209 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002210 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002211 if (ucnhash_CAPI == NULL)
2212 goto ucnhashError;
2213 }
2214 if (*s == '{') {
2215 const char *start = s+1;
2216 /* look for the closing brace */
2217 while (*s != '}' && s < end)
2218 s++;
2219 if (s > start && s < end && *s == '}') {
2220 /* found a name. look it up in the unicode database */
2221 message = "unknown Unicode character name";
2222 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002223 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002224 goto store;
2225 }
2226 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002227 endinpos = s-starts;
2228 outpos = p-PyUnicode_AS_UNICODE(v);
2229 if (unicode_decode_call_errorhandler(
2230 errors, &errorHandler,
2231 "unicodeescape", message,
2232 starts, size, &startinpos, &endinpos, &exc, &s,
2233 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002234 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002235 break;
2236
2237 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002238 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239 message = "\\ at end of string";
2240 s--;
2241 endinpos = s-starts;
2242 outpos = p-PyUnicode_AS_UNICODE(v);
2243 if (unicode_decode_call_errorhandler(
2244 errors, &errorHandler,
2245 "unicodeescape", message,
2246 starts, size, &startinpos, &endinpos, &exc, &s,
2247 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002248 goto onError;
2249 }
2250 else {
2251 *p++ = '\\';
2252 *p++ = (unsigned char)s[-1];
2253 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002254 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 nextByte:
2257 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002259 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002260 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002261 Py_XDECREF(errorHandler);
2262 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002264
Fredrik Lundhccc74732001-02-18 22:13:49 +00002265ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002266 PyErr_SetString(
2267 PyExc_UnicodeError,
2268 "\\N escapes not supported (can't load unicodedata module)"
2269 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002270 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 Py_XDECREF(errorHandler);
2272 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002273 return NULL;
2274
Fredrik Lundhccc74732001-02-18 22:13:49 +00002275onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 Py_XDECREF(errorHandler);
2278 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 return NULL;
2280}
2281
2282/* Return a Unicode-Escape string version of the Unicode object.
2283
2284 If quotes is true, the string is enclosed in u"" or u'' quotes as
2285 appropriate.
2286
2287*/
2288
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002289Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002290 Py_ssize_t size,
2291 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002292{
2293 /* like wcschr, but doesn't stop at NULL characters */
2294
2295 while (size-- > 0) {
2296 if (*s == ch)
2297 return s;
2298 s++;
2299 }
2300
2301 return NULL;
2302}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002303
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304static
2305PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002306 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 int quotes)
2308{
2309 PyObject *repr;
2310 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002312 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313
Neal Norwitz17753ec2006-08-21 22:21:19 +00002314 /* XXX(nnorwitz): rather than over-allocating, it would be
2315 better to choose a different scheme. Perhaps scan the
2316 first N-chars of the string and allocate based on that size.
2317 */
2318 /* Initial allocation is based on the longest-possible unichr
2319 escape.
2320
2321 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2322 unichr, so in this case it's the longest unichr escape. In
2323 narrow (UTF-16) builds this is five chars per source unichr
2324 since there are two unichrs in the surrogate pair, so in narrow
2325 (UTF-16) builds it's not the longest unichr escape.
2326
2327 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2328 so in the narrow (UTF-16) build case it's the longest unichr
2329 escape.
2330 */
2331
2332 repr = PyString_FromStringAndSize(NULL,
2333 2
2334#ifdef Py_UNICODE_WIDE
2335 + 10*size
2336#else
2337 + 6*size
2338#endif
2339 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340 if (repr == NULL)
2341 return NULL;
2342
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002343 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344
2345 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002347 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002348 !findchar(s, size, '"')) ? '"' : '\'';
2349 }
2350 while (size-- > 0) {
2351 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002352
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002353 /* Escape quotes and backslashes */
2354 if ((quotes &&
2355 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 *p++ = '\\';
2357 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002358 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002359 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002360
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002361#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002362 /* Map 21-bit characters to '\U00xxxxxx' */
2363 else if (ch >= 0x10000) {
2364 *p++ = '\\';
2365 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002366 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2367 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2368 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2369 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2370 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2371 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2372 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002373 *p++ = hexdigit[ch & 0x0000000F];
2374 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002375 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002376#else
2377 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002378 else if (ch >= 0xD800 && ch < 0xDC00) {
2379 Py_UNICODE ch2;
2380 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002381
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002382 ch2 = *s++;
2383 size--;
2384 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2385 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2386 *p++ = '\\';
2387 *p++ = 'U';
2388 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2389 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2390 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2391 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2392 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2393 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2394 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2395 *p++ = hexdigit[ucs & 0x0000000F];
2396 continue;
2397 }
2398 /* Fall through: isolated surrogates are copied as-is */
2399 s--;
2400 size++;
2401 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002402#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002403
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002405 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 *p++ = '\\';
2407 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002408 *p++ = hexdigit[(ch >> 12) & 0x000F];
2409 *p++ = hexdigit[(ch >> 8) & 0x000F];
2410 *p++ = hexdigit[(ch >> 4) & 0x000F];
2411 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002413
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002414 /* Map special whitespace to '\t', \n', '\r' */
2415 else if (ch == '\t') {
2416 *p++ = '\\';
2417 *p++ = 't';
2418 }
2419 else if (ch == '\n') {
2420 *p++ = '\\';
2421 *p++ = 'n';
2422 }
2423 else if (ch == '\r') {
2424 *p++ = '\\';
2425 *p++ = 'r';
2426 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002427
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002428 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002429 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002431 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002432 *p++ = hexdigit[(ch >> 4) & 0x000F];
2433 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002434 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002435
Guido van Rossumd57fd912000-03-10 22:53:23 +00002436 /* Copy everything else as-is */
2437 else
2438 *p++ = (char) ch;
2439 }
2440 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002441 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442
2443 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002444 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 return repr;
2446}
2447
2448PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002449 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002450{
2451 return unicodeescape_string(s, size, 0);
2452}
2453
2454PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2455{
2456 if (!PyUnicode_Check(unicode)) {
2457 PyErr_BadArgument();
2458 return NULL;
2459 }
2460 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2461 PyUnicode_GET_SIZE(unicode));
2462}
2463
2464/* --- Raw Unicode Escape Codec ------------------------------------------- */
2465
2466PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002467 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 const char *errors)
2469{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002471 Py_ssize_t startinpos;
2472 Py_ssize_t endinpos;
2473 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 const char *end;
2477 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 PyObject *errorHandler = NULL;
2479 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002480
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 /* Escaped strings will always be longer than the resulting
2482 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002483 length after conversion to the true value. (But decoding error
2484 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 v = _PyUnicode_New(size);
2486 if (v == NULL)
2487 goto onError;
2488 if (size == 0)
2489 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002490 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 end = s + size;
2492 while (s < end) {
2493 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002494 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002496 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497
2498 /* Non-escape characters are interpreted as Unicode ordinals */
2499 if (*s != '\\') {
2500 *p++ = (unsigned char)*s++;
2501 continue;
2502 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002503 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504
2505 /* \u-escapes are only interpreted iff the number of leading
2506 backslashes if odd */
2507 bs = s;
2508 for (;s < end;) {
2509 if (*s != '\\')
2510 break;
2511 *p++ = (unsigned char)*s++;
2512 }
2513 if (((s - bs) & 1) == 0 ||
2514 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002515 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 continue;
2517 }
2518 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002519 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 s++;
2521
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002522 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002523 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002524 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002525 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002527 endinpos = s-starts;
2528 if (unicode_decode_call_errorhandler(
2529 errors, &errorHandler,
2530 "rawunicodeescape", "truncated \\uXXXX",
2531 starts, size, &startinpos, &endinpos, &exc, &s,
2532 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002534 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 }
2536 x = (x<<4) & ~0xF;
2537 if (c >= '0' && c <= '9')
2538 x += c - '0';
2539 else if (c >= 'a' && c <= 'f')
2540 x += 10 + c - 'a';
2541 else
2542 x += 10 + c - 'A';
2543 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002544#ifndef Py_UNICODE_WIDE
2545 if (x > 0x10000) {
2546 if (unicode_decode_call_errorhandler(
2547 errors, &errorHandler,
2548 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2549 starts, size, &startinpos, &endinpos, &exc, &s,
2550 (PyObject **)&v, &outpos, &p))
2551 goto onError;
2552 }
2553#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002554 *p++ = x;
2555 nextByte:
2556 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002558 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002559 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 Py_XDECREF(errorHandler);
2561 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002563
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 onError:
2565 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 Py_XDECREF(errorHandler);
2567 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 return NULL;
2569}
2570
2571PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002572 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573{
2574 PyObject *repr;
2575 char *p;
2576 char *q;
2577
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002578 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002580#ifdef Py_UNICODE_WIDE
2581 repr = PyString_FromStringAndSize(NULL, 10 * size);
2582#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002584#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 if (repr == NULL)
2586 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002587 if (size == 0)
2588 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589
2590 p = q = PyString_AS_STRING(repr);
2591 while (size-- > 0) {
2592 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002593#ifdef Py_UNICODE_WIDE
2594 /* Map 32-bit characters to '\Uxxxxxxxx' */
2595 if (ch >= 0x10000) {
2596 *p++ = '\\';
2597 *p++ = 'U';
2598 *p++ = hexdigit[(ch >> 28) & 0xf];
2599 *p++ = hexdigit[(ch >> 24) & 0xf];
2600 *p++ = hexdigit[(ch >> 20) & 0xf];
2601 *p++ = hexdigit[(ch >> 16) & 0xf];
2602 *p++ = hexdigit[(ch >> 12) & 0xf];
2603 *p++ = hexdigit[(ch >> 8) & 0xf];
2604 *p++ = hexdigit[(ch >> 4) & 0xf];
2605 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002606 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002607 else
2608#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 /* Map 16-bit characters to '\uxxxx' */
2610 if (ch >= 256) {
2611 *p++ = '\\';
2612 *p++ = 'u';
2613 *p++ = hexdigit[(ch >> 12) & 0xf];
2614 *p++ = hexdigit[(ch >> 8) & 0xf];
2615 *p++ = hexdigit[(ch >> 4) & 0xf];
2616 *p++ = hexdigit[ch & 15];
2617 }
2618 /* Copy everything else as-is */
2619 else
2620 *p++ = (char) ch;
2621 }
2622 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002623 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 return repr;
2625}
2626
2627PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2628{
2629 if (!PyUnicode_Check(unicode)) {
2630 PyErr_BadArgument();
2631 return NULL;
2632 }
2633 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2634 PyUnicode_GET_SIZE(unicode));
2635}
2636
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002637/* --- Unicode Internal Codec ------------------------------------------- */
2638
2639PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002640 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002641 const char *errors)
2642{
2643 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002644 Py_ssize_t startinpos;
2645 Py_ssize_t endinpos;
2646 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002647 PyUnicodeObject *v;
2648 Py_UNICODE *p;
2649 const char *end;
2650 const char *reason;
2651 PyObject *errorHandler = NULL;
2652 PyObject *exc = NULL;
2653
Neal Norwitzd43069c2006-01-08 01:12:10 +00002654#ifdef Py_UNICODE_WIDE
2655 Py_UNICODE unimax = PyUnicode_GetMax();
2656#endif
2657
Armin Rigo7ccbca92006-10-04 12:17:45 +00002658 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002659 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2660 if (v == NULL)
2661 goto onError;
2662 if (PyUnicode_GetSize((PyObject *)v) == 0)
2663 return (PyObject *)v;
2664 p = PyUnicode_AS_UNICODE(v);
2665 end = s + size;
2666
2667 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002668 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002669 /* We have to sanity check the raw data, otherwise doom looms for
2670 some malformed UCS-4 data. */
2671 if (
2672 #ifdef Py_UNICODE_WIDE
2673 *p > unimax || *p < 0 ||
2674 #endif
2675 end-s < Py_UNICODE_SIZE
2676 )
2677 {
2678 startinpos = s - starts;
2679 if (end-s < Py_UNICODE_SIZE) {
2680 endinpos = end-starts;
2681 reason = "truncated input";
2682 }
2683 else {
2684 endinpos = s - starts + Py_UNICODE_SIZE;
2685 reason = "illegal code point (> 0x10FFFF)";
2686 }
2687 outpos = p - PyUnicode_AS_UNICODE(v);
2688 if (unicode_decode_call_errorhandler(
2689 errors, &errorHandler,
2690 "unicode_internal", reason,
2691 starts, size, &startinpos, &endinpos, &exc, &s,
2692 (PyObject **)&v, &outpos, &p)) {
2693 goto onError;
2694 }
2695 }
2696 else {
2697 p++;
2698 s += Py_UNICODE_SIZE;
2699 }
2700 }
2701
Martin v. Löwis412fb672006-04-13 06:34:32 +00002702 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002703 goto onError;
2704 Py_XDECREF(errorHandler);
2705 Py_XDECREF(exc);
2706 return (PyObject *)v;
2707
2708 onError:
2709 Py_XDECREF(v);
2710 Py_XDECREF(errorHandler);
2711 Py_XDECREF(exc);
2712 return NULL;
2713}
2714
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715/* --- Latin-1 Codec ------------------------------------------------------ */
2716
2717PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002718 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 const char *errors)
2720{
2721 PyUnicodeObject *v;
2722 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002723
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002725 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002726 Py_UNICODE r = *(unsigned char*)s;
2727 return PyUnicode_FromUnicode(&r, 1);
2728 }
2729
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 v = _PyUnicode_New(size);
2731 if (v == NULL)
2732 goto onError;
2733 if (size == 0)
2734 return (PyObject *)v;
2735 p = PyUnicode_AS_UNICODE(v);
2736 while (size-- > 0)
2737 *p++ = (unsigned char)*s++;
2738 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002739
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 onError:
2741 Py_XDECREF(v);
2742 return NULL;
2743}
2744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745/* create or adjust a UnicodeEncodeError */
2746static void make_encode_exception(PyObject **exceptionObject,
2747 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002748 const Py_UNICODE *unicode, Py_ssize_t size,
2749 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002750 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 if (*exceptionObject == NULL) {
2753 *exceptionObject = PyUnicodeEncodeError_Create(
2754 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 }
2756 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2758 goto onError;
2759 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2760 goto onError;
2761 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2762 goto onError;
2763 return;
2764 onError:
2765 Py_DECREF(*exceptionObject);
2766 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 }
2768}
2769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770/* raises a UnicodeEncodeError */
2771static void raise_encode_exception(PyObject **exceptionObject,
2772 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002773 const Py_UNICODE *unicode, Py_ssize_t size,
2774 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 const char *reason)
2776{
2777 make_encode_exception(exceptionObject,
2778 encoding, unicode, size, startpos, endpos, reason);
2779 if (*exceptionObject != NULL)
2780 PyCodec_StrictErrors(*exceptionObject);
2781}
2782
2783/* error handling callback helper:
2784 build arguments, call the callback and check the arguments,
2785 put the result into newpos and return the replacement string, which
2786 has to be freed by the caller */
2787static PyObject *unicode_encode_call_errorhandler(const char *errors,
2788 PyObject **errorHandler,
2789 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002790 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2791 Py_ssize_t startpos, Py_ssize_t endpos,
2792 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002794 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002795
2796 PyObject *restuple;
2797 PyObject *resunicode;
2798
2799 if (*errorHandler == NULL) {
2800 *errorHandler = PyCodec_LookupError(errors);
2801 if (*errorHandler == NULL)
2802 return NULL;
2803 }
2804
2805 make_encode_exception(exceptionObject,
2806 encoding, unicode, size, startpos, endpos, reason);
2807 if (*exceptionObject == NULL)
2808 return NULL;
2809
2810 restuple = PyObject_CallFunctionObjArgs(
2811 *errorHandler, *exceptionObject, NULL);
2812 if (restuple == NULL)
2813 return NULL;
2814 if (!PyTuple_Check(restuple)) {
2815 PyErr_Format(PyExc_TypeError, &argparse[4]);
2816 Py_DECREF(restuple);
2817 return NULL;
2818 }
2819 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2820 &resunicode, newpos)) {
2821 Py_DECREF(restuple);
2822 return NULL;
2823 }
2824 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002825 *newpos = size+*newpos;
2826 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002827 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002828 Py_DECREF(restuple);
2829 return NULL;
2830 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 Py_INCREF(resunicode);
2832 Py_DECREF(restuple);
2833 return resunicode;
2834}
2835
2836static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002837 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 const char *errors,
2839 int limit)
2840{
2841 /* output object */
2842 PyObject *res;
2843 /* pointers to the beginning and end+1 of input */
2844 const Py_UNICODE *startp = p;
2845 const Py_UNICODE *endp = p + size;
2846 /* pointer to the beginning of the unencodable characters */
2847 /* const Py_UNICODE *badp = NULL; */
2848 /* pointer into the output */
2849 char *str;
2850 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002851 Py_ssize_t respos = 0;
2852 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002853 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2854 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 PyObject *errorHandler = NULL;
2856 PyObject *exc = NULL;
2857 /* the following variable is used for caching string comparisons
2858 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2859 int known_errorHandler = -1;
2860
2861 /* allocate enough for a simple encoding without
2862 replacements, if we need more, we'll resize */
2863 res = PyString_FromStringAndSize(NULL, size);
2864 if (res == NULL)
2865 goto onError;
2866 if (size == 0)
2867 return res;
2868 str = PyString_AS_STRING(res);
2869 ressize = size;
2870
2871 while (p<endp) {
2872 Py_UNICODE c = *p;
2873
2874 /* can we encode this? */
2875 if (c<limit) {
2876 /* no overflow check, because we know that the space is enough */
2877 *str++ = (char)c;
2878 ++p;
2879 }
2880 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002881 Py_ssize_t unicodepos = p-startp;
2882 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002883 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002884 Py_ssize_t repsize;
2885 Py_ssize_t newpos;
2886 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 Py_UNICODE *uni2;
2888 /* startpos for collecting unencodable chars */
2889 const Py_UNICODE *collstart = p;
2890 const Py_UNICODE *collend = p;
2891 /* find all unecodable characters */
2892 while ((collend < endp) && ((*collend)>=limit))
2893 ++collend;
2894 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2895 if (known_errorHandler==-1) {
2896 if ((errors==NULL) || (!strcmp(errors, "strict")))
2897 known_errorHandler = 1;
2898 else if (!strcmp(errors, "replace"))
2899 known_errorHandler = 2;
2900 else if (!strcmp(errors, "ignore"))
2901 known_errorHandler = 3;
2902 else if (!strcmp(errors, "xmlcharrefreplace"))
2903 known_errorHandler = 4;
2904 else
2905 known_errorHandler = 0;
2906 }
2907 switch (known_errorHandler) {
2908 case 1: /* strict */
2909 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2910 goto onError;
2911 case 2: /* replace */
2912 while (collstart++<collend)
2913 *str++ = '?'; /* fall through */
2914 case 3: /* ignore */
2915 p = collend;
2916 break;
2917 case 4: /* xmlcharrefreplace */
2918 respos = str-PyString_AS_STRING(res);
2919 /* determine replacement size (temporarily (mis)uses p) */
2920 for (p = collstart, repsize = 0; p < collend; ++p) {
2921 if (*p<10)
2922 repsize += 2+1+1;
2923 else if (*p<100)
2924 repsize += 2+2+1;
2925 else if (*p<1000)
2926 repsize += 2+3+1;
2927 else if (*p<10000)
2928 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002929#ifndef Py_UNICODE_WIDE
2930 else
2931 repsize += 2+5+1;
2932#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 else if (*p<100000)
2934 repsize += 2+5+1;
2935 else if (*p<1000000)
2936 repsize += 2+6+1;
2937 else
2938 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002939#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 }
2941 requiredsize = respos+repsize+(endp-collend);
2942 if (requiredsize > ressize) {
2943 if (requiredsize<2*ressize)
2944 requiredsize = 2*ressize;
2945 if (_PyString_Resize(&res, requiredsize))
2946 goto onError;
2947 str = PyString_AS_STRING(res) + respos;
2948 ressize = requiredsize;
2949 }
2950 /* generate replacement (temporarily (mis)uses p) */
2951 for (p = collstart; p < collend; ++p) {
2952 str += sprintf(str, "&#%d;", (int)*p);
2953 }
2954 p = collend;
2955 break;
2956 default:
2957 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2958 encoding, reason, startp, size, &exc,
2959 collstart-startp, collend-startp, &newpos);
2960 if (repunicode == NULL)
2961 goto onError;
2962 /* need more space? (at least enough for what we
2963 have+the replacement+the rest of the string, so
2964 we won't have to check space for encodable characters) */
2965 respos = str-PyString_AS_STRING(res);
2966 repsize = PyUnicode_GET_SIZE(repunicode);
2967 requiredsize = respos+repsize+(endp-collend);
2968 if (requiredsize > ressize) {
2969 if (requiredsize<2*ressize)
2970 requiredsize = 2*ressize;
2971 if (_PyString_Resize(&res, requiredsize)) {
2972 Py_DECREF(repunicode);
2973 goto onError;
2974 }
2975 str = PyString_AS_STRING(res) + respos;
2976 ressize = requiredsize;
2977 }
2978 /* check if there is anything unencodable in the replacement
2979 and copy it to the output */
2980 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2981 c = *uni2;
2982 if (c >= limit) {
2983 raise_encode_exception(&exc, encoding, startp, size,
2984 unicodepos, unicodepos+1, reason);
2985 Py_DECREF(repunicode);
2986 goto onError;
2987 }
2988 *str = (char)c;
2989 }
2990 p = startp + newpos;
2991 Py_DECREF(repunicode);
2992 }
2993 }
2994 }
2995 /* Resize if we allocated to much */
2996 respos = str-PyString_AS_STRING(res);
2997 if (respos<ressize)
2998 /* If this falls res will be NULL */
2999 _PyString_Resize(&res, respos);
3000 Py_XDECREF(errorHandler);
3001 Py_XDECREF(exc);
3002 return res;
3003
3004 onError:
3005 Py_XDECREF(res);
3006 Py_XDECREF(errorHandler);
3007 Py_XDECREF(exc);
3008 return NULL;
3009}
3010
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003012 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 const char *errors)
3014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016}
3017
3018PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3019{
3020 if (!PyUnicode_Check(unicode)) {
3021 PyErr_BadArgument();
3022 return NULL;
3023 }
3024 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3025 PyUnicode_GET_SIZE(unicode),
3026 NULL);
3027}
3028
3029/* --- 7-bit ASCII Codec -------------------------------------------------- */
3030
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003032 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 const char *errors)
3034{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 PyUnicodeObject *v;
3037 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003038 Py_ssize_t startinpos;
3039 Py_ssize_t endinpos;
3040 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 const char *e;
3042 PyObject *errorHandler = NULL;
3043 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003044
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003046 if (size == 1 && *(unsigned char*)s < 128) {
3047 Py_UNICODE r = *(unsigned char*)s;
3048 return PyUnicode_FromUnicode(&r, 1);
3049 }
Tim Petersced69f82003-09-16 20:30:58 +00003050
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 v = _PyUnicode_New(size);
3052 if (v == NULL)
3053 goto onError;
3054 if (size == 0)
3055 return (PyObject *)v;
3056 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057 e = s + size;
3058 while (s < e) {
3059 register unsigned char c = (unsigned char)*s;
3060 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 ++s;
3063 }
3064 else {
3065 startinpos = s-starts;
3066 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003067 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 if (unicode_decode_call_errorhandler(
3069 errors, &errorHandler,
3070 "ascii", "ordinal not in range(128)",
3071 starts, size, &startinpos, &endinpos, &exc, &s,
3072 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003076 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003077 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003078 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 Py_XDECREF(errorHandler);
3080 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003082
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 onError:
3084 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 Py_XDECREF(errorHandler);
3086 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 return NULL;
3088}
3089
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003091 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 const char *errors)
3093{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
3097PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3098{
3099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
3101 return NULL;
3102 }
3103 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3104 PyUnicode_GET_SIZE(unicode),
3105 NULL);
3106}
3107
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003108#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003109
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003110/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003111
Martin v. Löwisd8251432006-06-14 05:21:04 +00003112#if SIZEOF_INT < SIZEOF_SSIZE_T
3113#define NEED_RETRY
3114#endif
3115
3116/* XXX This code is limited to "true" double-byte encodings, as
3117 a) it assumes an incomplete character consists of a single byte, and
3118 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3119 encodings, see IsDBCSLeadByteEx documentation. */
3120
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte
   that starts a character, 0 otherwise.

   IsDBCSLeadByte() alone only classifies the byte value, so the previous
   character position (as reported by CharPrev) is inspected as well: the
   byte counts as a lead byte when there is no previous character, when the
   previous character does not begin with a lead byte, or when the previous
   character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (pos - before == 2);
}
3131
3132/*
3133 * Decode MBCS string into unicode object. If 'final' is set, converts
3134 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3135 */
3136static int decode_mbcs(PyUnicodeObject **v,
3137 const char *s, /* MBCS string */
3138 int size, /* sizeof MBCS string */
3139 int final)
3140{
3141 Py_UNICODE *p;
3142 Py_ssize_t n = 0;
3143 int usize = 0;
3144
3145 assert(size >= 0);
3146
3147 /* Skip trailing lead-byte unless 'final' is set */
3148 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3149 --size;
3150
3151 /* First get the size of the result */
3152 if (size > 0) {
3153 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3154 if (usize == 0) {
3155 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3156 return -1;
3157 }
3158 }
3159
3160 if (*v == NULL) {
3161 /* Create unicode object */
3162 *v = _PyUnicode_New(usize);
3163 if (*v == NULL)
3164 return -1;
3165 }
3166 else {
3167 /* Extend unicode object */
3168 n = PyUnicode_GET_SIZE(*v);
3169 if (_PyUnicode_Resize(v, n + usize) < 0)
3170 return -1;
3171 }
3172
3173 /* Do the conversion */
3174 if (size > 0) {
3175 p = PyUnicode_AS_UNICODE(*v) + n;
3176 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3177 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3178 return -1;
3179 }
3180 }
3181
3182 return size;
3183}
3184
3185PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3186 Py_ssize_t size,
3187 const char *errors,
3188 Py_ssize_t *consumed)
3189{
3190 PyUnicodeObject *v = NULL;
3191 int done;
3192
3193 if (consumed)
3194 *consumed = 0;
3195
3196#ifdef NEED_RETRY
3197 retry:
3198 if (size > INT_MAX)
3199 done = decode_mbcs(&v, s, INT_MAX, 0);
3200 else
3201#endif
3202 done = decode_mbcs(&v, s, (int)size, !consumed);
3203
3204 if (done < 0) {
3205 Py_XDECREF(v);
3206 return NULL;
3207 }
3208
3209 if (consumed)
3210 *consumed += done;
3211
3212#ifdef NEED_RETRY
3213 if (size > INT_MAX) {
3214 s += done;
3215 size -= done;
3216 goto retry;
3217 }
3218#endif
3219
3220 return (PyObject *)v;
3221}
3222
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003223PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003224 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003225 const char *errors)
3226{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003227 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3228}
3229
3230/*
3231 * Convert unicode into string object (MBCS).
3232 * Returns 0 if succeed, -1 otherwise.
3233 */
3234static int encode_mbcs(PyObject **repr,
3235 const Py_UNICODE *p, /* unicode */
3236 int size) /* size of unicode */
3237{
3238 int mbcssize = 0;
3239 Py_ssize_t n = 0;
3240
3241 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003242
3243 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003244 if (size > 0) {
3245 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3246 if (mbcssize == 0) {
3247 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3248 return -1;
3249 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003250 }
3251
Martin v. Löwisd8251432006-06-14 05:21:04 +00003252 if (*repr == NULL) {
3253 /* Create string object */
3254 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3255 if (*repr == NULL)
3256 return -1;
3257 }
3258 else {
3259 /* Extend string object */
3260 n = PyString_Size(*repr);
3261 if (_PyString_Resize(repr, n + mbcssize) < 0)
3262 return -1;
3263 }
3264
3265 /* Do the conversion */
3266 if (size > 0) {
3267 char *s = PyString_AS_STRING(*repr) + n;
3268 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3269 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3270 return -1;
3271 }
3272 }
3273
3274 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003275}
3276
3277PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003278 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003279 const char *errors)
3280{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003281 PyObject *repr = NULL;
3282 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003283
Martin v. Löwisd8251432006-06-14 05:21:04 +00003284#ifdef NEED_RETRY
3285 retry:
3286 if (size > INT_MAX)
3287 ret = encode_mbcs(&repr, p, INT_MAX);
3288 else
3289#endif
3290 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003291
Martin v. Löwisd8251432006-06-14 05:21:04 +00003292 if (ret < 0) {
3293 Py_XDECREF(repr);
3294 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003295 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003296
3297#ifdef NEED_RETRY
3298 if (size > INT_MAX) {
3299 p += INT_MAX;
3300 size -= INT_MAX;
3301 goto retry;
3302 }
3303#endif
3304
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003305 return repr;
3306}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003307
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003308PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3309{
3310 if (!PyUnicode_Check(unicode)) {
3311 PyErr_BadArgument();
3312 return NULL;
3313 }
3314 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3315 PyUnicode_GET_SIZE(unicode),
3316 NULL);
3317}
3318
Martin v. Löwisd8251432006-06-14 05:21:04 +00003319#undef NEED_RETRY
3320
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003321#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003322
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323/* --- Character Mapping Codec -------------------------------------------- */
3324
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003326 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 PyObject *mapping,
3328 const char *errors)
3329{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003331 Py_ssize_t startinpos;
3332 Py_ssize_t endinpos;
3333 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003334 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 PyUnicodeObject *v;
3336 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003337 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 PyObject *errorHandler = NULL;
3339 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003340 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003341 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003342
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 /* Default to Latin-1 */
3344 if (mapping == NULL)
3345 return PyUnicode_DecodeLatin1(s, size, errors);
3346
3347 v = _PyUnicode_New(size);
3348 if (v == NULL)
3349 goto onError;
3350 if (size == 0)
3351 return (PyObject *)v;
3352 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003353 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003354 if (PyUnicode_CheckExact(mapping)) {
3355 mapstring = PyUnicode_AS_UNICODE(mapping);
3356 maplen = PyUnicode_GET_SIZE(mapping);
3357 while (s < e) {
3358 unsigned char ch = *s;
3359 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003361 if (ch < maplen)
3362 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003364 if (x == 0xfffe) {
3365 /* undefined mapping */
3366 outpos = p-PyUnicode_AS_UNICODE(v);
3367 startinpos = s-starts;
3368 endinpos = startinpos+1;
3369 if (unicode_decode_call_errorhandler(
3370 errors, &errorHandler,
3371 "charmap", "character maps to <undefined>",
3372 starts, size, &startinpos, &endinpos, &exc, &s,
3373 (PyObject **)&v, &outpos, &p)) {
3374 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003375 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003376 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003377 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003378 *p++ = x;
3379 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003381 }
3382 else {
3383 while (s < e) {
3384 unsigned char ch = *s;
3385 PyObject *w, *x;
3386
3387 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3388 w = PyInt_FromLong((long)ch);
3389 if (w == NULL)
3390 goto onError;
3391 x = PyObject_GetItem(mapping, w);
3392 Py_DECREF(w);
3393 if (x == NULL) {
3394 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3395 /* No mapping found means: mapping is undefined. */
3396 PyErr_Clear();
3397 x = Py_None;
3398 Py_INCREF(x);
3399 } else
3400 goto onError;
3401 }
3402
3403 /* Apply mapping */
3404 if (PyInt_Check(x)) {
3405 long value = PyInt_AS_LONG(x);
3406 if (value < 0 || value > 65535) {
3407 PyErr_SetString(PyExc_TypeError,
3408 "character mapping must be in range(65536)");
3409 Py_DECREF(x);
3410 goto onError;
3411 }
3412 *p++ = (Py_UNICODE)value;
3413 }
3414 else if (x == Py_None) {
3415 /* undefined mapping */
3416 outpos = p-PyUnicode_AS_UNICODE(v);
3417 startinpos = s-starts;
3418 endinpos = startinpos+1;
3419 if (unicode_decode_call_errorhandler(
3420 errors, &errorHandler,
3421 "charmap", "character maps to <undefined>",
3422 starts, size, &startinpos, &endinpos, &exc, &s,
3423 (PyObject **)&v, &outpos, &p)) {
3424 Py_DECREF(x);
3425 goto onError;
3426 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003427 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003428 continue;
3429 }
3430 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003431 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003432
3433 if (targetsize == 1)
3434 /* 1-1 mapping */
3435 *p++ = *PyUnicode_AS_UNICODE(x);
3436
3437 else if (targetsize > 1) {
3438 /* 1-n mapping */
3439 if (targetsize > extrachars) {
3440 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003441 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3442 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003443 (targetsize << 2);
3444 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003445 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003446 if (_PyUnicode_Resize(&v,
3447 PyUnicode_GET_SIZE(v) + needed) < 0) {
3448 Py_DECREF(x);
3449 goto onError;
3450 }
3451 p = PyUnicode_AS_UNICODE(v) + oldpos;
3452 }
3453 Py_UNICODE_COPY(p,
3454 PyUnicode_AS_UNICODE(x),
3455 targetsize);
3456 p += targetsize;
3457 extrachars -= targetsize;
3458 }
3459 /* 1-0 mapping: skip the character */
3460 }
3461 else {
3462 /* wrong return value */
3463 PyErr_SetString(PyExc_TypeError,
3464 "character mapping must return integer, None or unicode");
3465 Py_DECREF(x);
3466 goto onError;
3467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003469 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 }
3472 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003473 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 Py_XDECREF(errorHandler);
3476 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003478
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 Py_XDECREF(errorHandler);
3481 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 Py_XDECREF(v);
3483 return NULL;
3484}
3485
Martin v. Löwis3f767792006-06-04 19:36:28 +00003486/* Charmap encoding: the lookup table */
3487
3488struct encoding_map{
3489 PyObject_HEAD
3490 unsigned char level1[32];
3491 int count2, count3;
3492 unsigned char level23[1];
3493};
3494
3495static PyObject*
3496encoding_map_size(PyObject *obj, PyObject* args)
3497{
3498 struct encoding_map *map = (struct encoding_map*)obj;
3499 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3500 128*map->count3);
3501}
3502
3503static PyMethodDef encoding_map_methods[] = {
3504 {"size", encoding_map_size, METH_NOARGS,
3505 PyDoc_STR("Return the size (in bytes) of this object") },
3506 { 0 }
3507};
3508
3509static void
3510encoding_map_dealloc(PyObject* o)
3511{
3512 PyObject_FREE(o);
3513}
3514
3515static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00003516 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003517 "EncodingMap", /*tp_name*/
3518 sizeof(struct encoding_map), /*tp_basicsize*/
3519 0, /*tp_itemsize*/
3520 /* methods */
3521 encoding_map_dealloc, /*tp_dealloc*/
3522 0, /*tp_print*/
3523 0, /*tp_getattr*/
3524 0, /*tp_setattr*/
3525 0, /*tp_compare*/
3526 0, /*tp_repr*/
3527 0, /*tp_as_number*/
3528 0, /*tp_as_sequence*/
3529 0, /*tp_as_mapping*/
3530 0, /*tp_hash*/
3531 0, /*tp_call*/
3532 0, /*tp_str*/
3533 0, /*tp_getattro*/
3534 0, /*tp_setattro*/
3535 0, /*tp_as_buffer*/
3536 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3537 0, /*tp_doc*/
3538 0, /*tp_traverse*/
3539 0, /*tp_clear*/
3540 0, /*tp_richcompare*/
3541 0, /*tp_weaklistoffset*/
3542 0, /*tp_iter*/
3543 0, /*tp_iternext*/
3544 encoding_map_methods, /*tp_methods*/
3545 0, /*tp_members*/
3546 0, /*tp_getset*/
3547 0, /*tp_base*/
3548 0, /*tp_dict*/
3549 0, /*tp_descr_get*/
3550 0, /*tp_descr_set*/
3551 0, /*tp_dictoffset*/
3552 0, /*tp_init*/
3553 0, /*tp_alloc*/
3554 0, /*tp_new*/
3555 0, /*tp_free*/
3556 0, /*tp_is_gc*/
3557};
3558
3559PyObject*
3560PyUnicode_BuildEncodingMap(PyObject* string)
3561{
3562 Py_UNICODE *decode;
3563 PyObject *result;
3564 struct encoding_map *mresult;
3565 int i;
3566 int need_dict = 0;
3567 unsigned char level1[32];
3568 unsigned char level2[512];
3569 unsigned char *mlevel1, *mlevel2, *mlevel3;
3570 int count2 = 0, count3 = 0;
3571
3572 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3573 PyErr_BadArgument();
3574 return NULL;
3575 }
3576 decode = PyUnicode_AS_UNICODE(string);
3577 memset(level1, 0xFF, sizeof level1);
3578 memset(level2, 0xFF, sizeof level2);
3579
3580 /* If there isn't a one-to-one mapping of NULL to \0,
3581 or if there are non-BMP characters, we need to use
3582 a mapping dictionary. */
3583 if (decode[0] != 0)
3584 need_dict = 1;
3585 for (i = 1; i < 256; i++) {
3586 int l1, l2;
3587 if (decode[i] == 0
3588 #ifdef Py_UNICODE_WIDE
3589 || decode[i] > 0xFFFF
3590 #endif
3591 ) {
3592 need_dict = 1;
3593 break;
3594 }
3595 if (decode[i] == 0xFFFE)
3596 /* unmapped character */
3597 continue;
3598 l1 = decode[i] >> 11;
3599 l2 = decode[i] >> 7;
3600 if (level1[l1] == 0xFF)
3601 level1[l1] = count2++;
3602 if (level2[l2] == 0xFF)
3603 level2[l2] = count3++;
3604 }
3605
3606 if (count2 >= 0xFF || count3 >= 0xFF)
3607 need_dict = 1;
3608
3609 if (need_dict) {
3610 PyObject *result = PyDict_New();
3611 PyObject *key, *value;
3612 if (!result)
3613 return NULL;
3614 for (i = 0; i < 256; i++) {
3615 key = value = NULL;
3616 key = PyInt_FromLong(decode[i]);
3617 value = PyInt_FromLong(i);
3618 if (!key || !value)
3619 goto failed1;
3620 if (PyDict_SetItem(result, key, value) == -1)
3621 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003622 Py_DECREF(key);
3623 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003624 }
3625 return result;
3626 failed1:
3627 Py_XDECREF(key);
3628 Py_XDECREF(value);
3629 Py_DECREF(result);
3630 return NULL;
3631 }
3632
3633 /* Create a three-level trie */
3634 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3635 16*count2 + 128*count3 - 1);
3636 if (!result)
3637 return PyErr_NoMemory();
3638 PyObject_Init(result, &EncodingMapType);
3639 mresult = (struct encoding_map*)result;
3640 mresult->count2 = count2;
3641 mresult->count3 = count3;
3642 mlevel1 = mresult->level1;
3643 mlevel2 = mresult->level23;
3644 mlevel3 = mresult->level23 + 16*count2;
3645 memcpy(mlevel1, level1, 32);
3646 memset(mlevel2, 0xFF, 16*count2);
3647 memset(mlevel3, 0, 128*count3);
3648 count3 = 0;
3649 for (i = 1; i < 256; i++) {
3650 int o1, o2, o3, i2, i3;
3651 if (decode[i] == 0xFFFE)
3652 /* unmapped character */
3653 continue;
3654 o1 = decode[i]>>11;
3655 o2 = (decode[i]>>7) & 0xF;
3656 i2 = 16*mlevel1[o1] + o2;
3657 if (mlevel2[i2] == 0xFF)
3658 mlevel2[i2] = count3++;
3659 o3 = decode[i] & 0x7F;
3660 i3 = 128*mlevel2[i2] + o3;
3661 mlevel3[i3] = i;
3662 }
3663 return result;
3664}
3665
3666static int
3667encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3668{
3669 struct encoding_map *map = (struct encoding_map*)mapping;
3670 int l1 = c>>11;
3671 int l2 = (c>>7) & 0xF;
3672 int l3 = c & 0x7F;
3673 int i;
3674
3675#ifdef Py_UNICODE_WIDE
3676 if (c > 0xFFFF) {
3677 return -1;
3678 }
3679#endif
3680 if (c == 0)
3681 return 0;
3682 /* level 1*/
3683 i = map->level1[l1];
3684 if (i == 0xFF) {
3685 return -1;
3686 }
3687 /* level 2*/
3688 i = map->level23[16*i+l2];
3689 if (i == 0xFF) {
3690 return -1;
3691 }
3692 /* level 3 */
3693 i = map->level23[16*map->count2 + 128*i + l3];
3694 if (i == 0) {
3695 return -1;
3696 }
3697 return i;
3698}
3699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700/* Lookup the character ch in the mapping. If the character
3701 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003702 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 PyObject *w = PyInt_FromLong((long)c);
3706 PyObject *x;
3707
3708 if (w == NULL)
3709 return NULL;
3710 x = PyObject_GetItem(mapping, w);
3711 Py_DECREF(w);
3712 if (x == NULL) {
3713 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3714 /* No mapping found means: mapping is undefined. */
3715 PyErr_Clear();
3716 x = Py_None;
3717 Py_INCREF(x);
3718 return x;
3719 } else
3720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003722 else if (x == Py_None)
3723 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 else if (PyInt_Check(x)) {
3725 long value = PyInt_AS_LONG(x);
3726 if (value < 0 || value > 255) {
3727 PyErr_SetString(PyExc_TypeError,
3728 "character mapping must be in range(256)");
3729 Py_DECREF(x);
3730 return NULL;
3731 }
3732 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 else if (PyString_Check(x))
3735 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 /* wrong return value */
3738 PyErr_SetString(PyExc_TypeError,
3739 "character mapping must return integer, None or str");
3740 Py_DECREF(x);
3741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 }
3743}
3744
Martin v. Löwis3f767792006-06-04 19:36:28 +00003745static int
3746charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3747{
3748 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3749 /* exponentially overallocate to minimize reallocations */
3750 if (requiredsize < 2*outsize)
3751 requiredsize = 2*outsize;
3752 if (_PyString_Resize(outobj, requiredsize)) {
3753 return 0;
3754 }
3755 return 1;
3756}
3757
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping has no entry for the character */
    enc_EXCEPTION = 2   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761/* lookup the character, put the result in the output string and adjust
3762 various state variables. Reallocate the output string if not enough
3763 space is available. Return a new reference to the object that
3764 was put in the output buffer, or Py_None, if the mapping was undefined
3765 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003766 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003768charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003769 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003771 PyObject *rep;
3772 char *outstart;
3773 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774
Martin v. Löwis68192102007-07-21 06:55:02 +00003775 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003776 int res = encoding_map_lookup(c, mapping);
3777 Py_ssize_t requiredsize = *outpos+1;
3778 if (res == -1)
3779 return enc_FAILED;
3780 if (outsize<requiredsize)
3781 if (!charmapencode_resize(outobj, outpos, requiredsize))
3782 return enc_EXCEPTION;
3783 outstart = PyString_AS_STRING(*outobj);
3784 outstart[(*outpos)++] = (char)res;
3785 return enc_SUCCESS;
3786 }
3787
3788 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003790 return enc_EXCEPTION;
3791 else if (rep==Py_None) {
3792 Py_DECREF(rep);
3793 return enc_FAILED;
3794 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003796 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003797 if (outsize<requiredsize)
3798 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003800 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003802 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003803 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3804 }
3805 else {
3806 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003807 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3808 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003809 if (outsize<requiredsize)
3810 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003812 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003814 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 memcpy(outstart + *outpos, repchars, repsize);
3816 *outpos += repsize;
3817 }
3818 }
Georg Brandl9f167602006-06-04 21:46:16 +00003819 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003820 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821}
3822
3823/* handle an error in PyUnicode_EncodeCharmap
3824 Return 0 on success, -1 on error */
3825static
3826int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003827 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003829 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831{
3832 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003833 Py_ssize_t repsize;
3834 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 Py_UNICODE *uni2;
3836 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003837 Py_ssize_t collstartpos = *inpos;
3838 Py_ssize_t collendpos = *inpos+1;
3839 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 char *encoding = "charmap";
3841 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003842 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003843
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 /* find all unencodable characters */
3845 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003846 PyObject *rep;
Martin v. Löwis68192102007-07-21 06:55:02 +00003847 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003848 int res = encoding_map_lookup(p[collendpos], mapping);
3849 if (res != -1)
3850 break;
3851 ++collendpos;
3852 continue;
3853 }
3854
3855 rep = charmapencode_lookup(p[collendpos], mapping);
3856 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003858 else if (rep!=Py_None) {
3859 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 break;
3861 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003862 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003863 ++collendpos;
3864 }
3865 /* cache callback name lookup
3866 * (if not done yet, i.e. it's the first error) */
3867 if (*known_errorHandler==-1) {
3868 if ((errors==NULL) || (!strcmp(errors, "strict")))
3869 *known_errorHandler = 1;
3870 else if (!strcmp(errors, "replace"))
3871 *known_errorHandler = 2;
3872 else if (!strcmp(errors, "ignore"))
3873 *known_errorHandler = 3;
3874 else if (!strcmp(errors, "xmlcharrefreplace"))
3875 *known_errorHandler = 4;
3876 else
3877 *known_errorHandler = 0;
3878 }
3879 switch (*known_errorHandler) {
3880 case 1: /* strict */
3881 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3882 return -1;
3883 case 2: /* replace */
3884 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3885 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003886 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887 return -1;
3888 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003889 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3891 return -1;
3892 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 }
3894 /* fall through */
3895 case 3: /* ignore */
3896 *inpos = collendpos;
3897 break;
3898 case 4: /* xmlcharrefreplace */
3899 /* generate replacement (temporarily (mis)uses p) */
3900 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3901 char buffer[2+29+1+1];
3902 char *cp;
3903 sprintf(buffer, "&#%d;", (int)p[collpos]);
3904 for (cp = buffer; *cp; ++cp) {
3905 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003906 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003908 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3910 return -1;
3911 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 }
3913 }
3914 *inpos = collendpos;
3915 break;
3916 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003917 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918 encoding, reason, p, size, exceptionObject,
3919 collstartpos, collendpos, &newpos);
3920 if (repunicode == NULL)
3921 return -1;
3922 /* generate replacement */
3923 repsize = PyUnicode_GET_SIZE(repunicode);
3924 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3925 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003926 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 return -1;
3928 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003929 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3932 return -1;
3933 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 }
3935 *inpos = newpos;
3936 Py_DECREF(repunicode);
3937 }
3938 return 0;
3939}
3940
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003942 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 PyObject *mapping,
3944 const char *errors)
3945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 /* output object */
3947 PyObject *res = NULL;
3948 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003949 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003951 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 PyObject *errorHandler = NULL;
3953 PyObject *exc = NULL;
3954 /* the following variable is used for caching string comparisons
3955 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3956 * 3=ignore, 4=xmlcharrefreplace */
3957 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958
3959 /* Default to Latin-1 */
3960 if (mapping == NULL)
3961 return PyUnicode_EncodeLatin1(p, size, errors);
3962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 /* allocate enough for a simple encoding without
3964 replacements, if we need more, we'll resize */
3965 res = PyString_FromStringAndSize(NULL, size);
3966 if (res == NULL)
3967 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003968 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 while (inpos<size) {
3972 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003973 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3974 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003976 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 if (charmap_encoding_error(p, size, &inpos, mapping,
3978 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003979 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003980 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003981 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 else
3985 /* done with this character => adjust input position */
3986 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 /* Resize if we allocated to much */
3990 if (respos<PyString_GET_SIZE(res)) {
3991 if (_PyString_Resize(&res, respos))
3992 goto onError;
3993 }
3994 Py_XDECREF(exc);
3995 Py_XDECREF(errorHandler);
3996 return res;
3997
3998 onError:
3999 Py_XDECREF(res);
4000 Py_XDECREF(exc);
4001 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 return NULL;
4003}
4004
4005PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4006 PyObject *mapping)
4007{
4008 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4009 PyErr_BadArgument();
4010 return NULL;
4011 }
4012 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4013 PyUnicode_GET_SIZE(unicode),
4014 mapping,
4015 NULL);
4016}
4017
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018/* create or adjust a UnicodeTranslateError */
4019static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004020 const Py_UNICODE *unicode, Py_ssize_t size,
4021 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 if (*exceptionObject == NULL) {
4025 *exceptionObject = PyUnicodeTranslateError_Create(
4026 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 }
4028 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4030 goto onError;
4031 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4032 goto onError;
4033 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4034 goto onError;
4035 return;
4036 onError:
4037 Py_DECREF(*exceptionObject);
4038 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 }
4040}
4041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042/* raises a UnicodeTranslateError */
4043static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 const Py_UNICODE *unicode, Py_ssize_t size,
4045 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046 const char *reason)
4047{
4048 make_translate_exception(exceptionObject,
4049 unicode, size, startpos, endpos, reason);
4050 if (*exceptionObject != NULL)
4051 PyCodec_StrictErrors(*exceptionObject);
4052}
4053
4054/* error handling callback helper:
4055 build arguments, call the callback and check the arguments,
4056 put the result into newpos and return the replacement string, which
4057 has to be freed by the caller */
4058static PyObject *unicode_translate_call_errorhandler(const char *errors,
4059 PyObject **errorHandler,
4060 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004061 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4062 Py_ssize_t startpos, Py_ssize_t endpos,
4063 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004065 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066
Martin v. Löwis412fb672006-04-13 06:34:32 +00004067 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 PyObject *restuple;
4069 PyObject *resunicode;
4070
4071 if (*errorHandler == NULL) {
4072 *errorHandler = PyCodec_LookupError(errors);
4073 if (*errorHandler == NULL)
4074 return NULL;
4075 }
4076
4077 make_translate_exception(exceptionObject,
4078 unicode, size, startpos, endpos, reason);
4079 if (*exceptionObject == NULL)
4080 return NULL;
4081
4082 restuple = PyObject_CallFunctionObjArgs(
4083 *errorHandler, *exceptionObject, NULL);
4084 if (restuple == NULL)
4085 return NULL;
4086 if (!PyTuple_Check(restuple)) {
4087 PyErr_Format(PyExc_TypeError, &argparse[4]);
4088 Py_DECREF(restuple);
4089 return NULL;
4090 }
4091 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004092 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093 Py_DECREF(restuple);
4094 return NULL;
4095 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004096 if (i_newpos<0)
4097 *newpos = size+i_newpos;
4098 else
4099 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004100 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004101 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004102 Py_DECREF(restuple);
4103 return NULL;
4104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004105 Py_INCREF(resunicode);
4106 Py_DECREF(restuple);
4107 return resunicode;
4108}
4109
4110/* Lookup the character ch in the mapping and put the result in result,
4111 which must be decrefed by the caller.
4112 Return 0 on success, -1 on error */
4113static
4114int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4115{
4116 PyObject *w = PyInt_FromLong((long)c);
4117 PyObject *x;
4118
4119 if (w == NULL)
4120 return -1;
4121 x = PyObject_GetItem(mapping, w);
4122 Py_DECREF(w);
4123 if (x == NULL) {
4124 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4125 /* No mapping found means: use 1:1 mapping. */
4126 PyErr_Clear();
4127 *result = NULL;
4128 return 0;
4129 } else
4130 return -1;
4131 }
4132 else if (x == Py_None) {
4133 *result = x;
4134 return 0;
4135 }
4136 else if (PyInt_Check(x)) {
4137 long value = PyInt_AS_LONG(x);
4138 long max = PyUnicode_GetMax();
4139 if (value < 0 || value > max) {
4140 PyErr_Format(PyExc_TypeError,
4141 "character mapping must be in range(0x%lx)", max+1);
4142 Py_DECREF(x);
4143 return -1;
4144 }
4145 *result = x;
4146 return 0;
4147 }
4148 else if (PyUnicode_Check(x)) {
4149 *result = x;
4150 return 0;
4151 }
4152 else {
4153 /* wrong return value */
4154 PyErr_SetString(PyExc_TypeError,
4155 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004156 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 return -1;
4158 }
4159}
4160/* ensure that *outobj is at least requiredsize characters long,
4161if not reallocate and adjust various state variables.
4162Return 0 on success, -1 on error */
4163static
Walter Dörwald4894c302003-10-24 14:25:28 +00004164int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004165 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004167 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004168 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004170 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004172 if (requiredsize < 2 * oldsize)
4173 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004174 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 return -1;
4176 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 }
4178 return 0;
4179}
4180/* lookup the character, put the result in the output string and adjust
4181 various state variables. Return a new reference to the object that
4182 was put in the output buffer in *result, or Py_None, if the mapping was
4183 undefined (in which case no character was written).
4184 The called must decref result.
4185 Return 0 on success, -1 on error. */
4186static
Walter Dörwald4894c302003-10-24 14:25:28 +00004187int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004188 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004189 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190{
Walter Dörwald4894c302003-10-24 14:25:28 +00004191 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 return -1;
4193 if (*res==NULL) {
4194 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004195 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 }
4197 else if (*res==Py_None)
4198 ;
4199 else if (PyInt_Check(*res)) {
4200 /* no overflow check, because we know that the space is enough */
4201 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4202 }
4203 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004204 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 if (repsize==1) {
4206 /* no overflow check, because we know that the space is enough */
4207 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4208 }
4209 else if (repsize!=0) {
4210 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004211 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004212 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004213 repsize - 1;
4214 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 return -1;
4216 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4217 *outp += repsize;
4218 }
4219 }
4220 else
4221 return -1;
4222 return 0;
4223}
4224
4225PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004226 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 PyObject *mapping,
4228 const char *errors)
4229{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 /* output object */
4231 PyObject *res = NULL;
4232 /* pointers to the beginning and end+1 of input */
4233 const Py_UNICODE *startp = p;
4234 const Py_UNICODE *endp = p + size;
4235 /* pointer into the output */
4236 Py_UNICODE *str;
4237 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004238 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 char *reason = "character maps to <undefined>";
4240 PyObject *errorHandler = NULL;
4241 PyObject *exc = NULL;
4242 /* the following variable is used for caching string comparisons
4243 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4244 * 3=ignore, 4=xmlcharrefreplace */
4245 int known_errorHandler = -1;
4246
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 if (mapping == NULL) {
4248 PyErr_BadArgument();
4249 return NULL;
4250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251
4252 /* allocate enough for a simple 1:1 translation without
4253 replacements, if we need more, we'll resize */
4254 res = PyUnicode_FromUnicode(NULL, size);
4255 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004256 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 return res;
4259 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 while (p<endp) {
4262 /* try to encode it */
4263 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004264 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 goto onError;
4267 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004268 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 if (x!=Py_None) /* it worked => adjust input pointer */
4270 ++p;
4271 else { /* untranslatable character */
4272 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004273 Py_ssize_t repsize;
4274 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 Py_UNICODE *uni2;
4276 /* startpos for collecting untranslatable chars */
4277 const Py_UNICODE *collstart = p;
4278 const Py_UNICODE *collend = p+1;
4279 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 /* find all untranslatable characters */
4282 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004283 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 goto onError;
4285 Py_XDECREF(x);
4286 if (x!=Py_None)
4287 break;
4288 ++collend;
4289 }
4290 /* cache callback name lookup
4291 * (if not done yet, i.e. it's the first error) */
4292 if (known_errorHandler==-1) {
4293 if ((errors==NULL) || (!strcmp(errors, "strict")))
4294 known_errorHandler = 1;
4295 else if (!strcmp(errors, "replace"))
4296 known_errorHandler = 2;
4297 else if (!strcmp(errors, "ignore"))
4298 known_errorHandler = 3;
4299 else if (!strcmp(errors, "xmlcharrefreplace"))
4300 known_errorHandler = 4;
4301 else
4302 known_errorHandler = 0;
4303 }
4304 switch (known_errorHandler) {
4305 case 1: /* strict */
4306 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4307 goto onError;
4308 case 2: /* replace */
4309 /* No need to check for space, this is a 1:1 replacement */
4310 for (coll = collstart; coll<collend; ++coll)
4311 *str++ = '?';
4312 /* fall through */
4313 case 3: /* ignore */
4314 p = collend;
4315 break;
4316 case 4: /* xmlcharrefreplace */
4317 /* generate replacement (temporarily (mis)uses p) */
4318 for (p = collstart; p < collend; ++p) {
4319 char buffer[2+29+1+1];
4320 char *cp;
4321 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004322 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4324 goto onError;
4325 for (cp = buffer; *cp; ++cp)
4326 *str++ = *cp;
4327 }
4328 p = collend;
4329 break;
4330 default:
4331 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4332 reason, startp, size, &exc,
4333 collstart-startp, collend-startp, &newpos);
4334 if (repunicode == NULL)
4335 goto onError;
4336 /* generate replacement */
4337 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004338 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4340 Py_DECREF(repunicode);
4341 goto onError;
4342 }
4343 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4344 *str++ = *uni2;
4345 p = startp + newpos;
4346 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 }
4348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 /* Resize if we allocated to much */
4351 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004352 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004353 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004354 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 }
4356 Py_XDECREF(exc);
4357 Py_XDECREF(errorHandler);
4358 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 onError:
4361 Py_XDECREF(res);
4362 Py_XDECREF(exc);
4363 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 return NULL;
4365}
4366
4367PyObject *PyUnicode_Translate(PyObject *str,
4368 PyObject *mapping,
4369 const char *errors)
4370{
4371 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004372
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373 str = PyUnicode_FromObject(str);
4374 if (str == NULL)
4375 goto onError;
4376 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4377 PyUnicode_GET_SIZE(str),
4378 mapping,
4379 errors);
4380 Py_DECREF(str);
4381 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004382
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 onError:
4384 Py_XDECREF(str);
4385 return NULL;
4386}
Tim Petersced69f82003-09-16 20:30:58 +00004387
Guido van Rossum9e896b32000-04-05 20:11:21 +00004388/* --- Decimal Encoder ---------------------------------------------------- */
4389
4390int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004392 char *output,
4393 const char *errors)
4394{
4395 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 PyObject *errorHandler = NULL;
4397 PyObject *exc = NULL;
4398 const char *encoding = "decimal";
4399 const char *reason = "invalid decimal Unicode string";
4400 /* the following variable is used for caching string comparisons
4401 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4402 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004403
4404 if (output == NULL) {
4405 PyErr_BadArgument();
4406 return -1;
4407 }
4408
4409 p = s;
4410 end = s + length;
4411 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004413 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004415 Py_ssize_t repsize;
4416 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 Py_UNICODE *uni2;
4418 Py_UNICODE *collstart;
4419 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004420
Guido van Rossum9e896b32000-04-05 20:11:21 +00004421 if (Py_UNICODE_ISSPACE(ch)) {
4422 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004424 continue;
4425 }
4426 decimal = Py_UNICODE_TODECIMAL(ch);
4427 if (decimal >= 0) {
4428 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004430 continue;
4431 }
Guido van Rossumba477042000-04-06 18:18:10 +00004432 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004433 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004435 continue;
4436 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 /* All other characters are considered unencodable */
4438 collstart = p;
4439 collend = p+1;
4440 while (collend < end) {
4441 if ((0 < *collend && *collend < 256) ||
4442 !Py_UNICODE_ISSPACE(*collend) ||
4443 Py_UNICODE_TODECIMAL(*collend))
4444 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 /* cache callback name lookup
4447 * (if not done yet, i.e. it's the first error) */
4448 if (known_errorHandler==-1) {
4449 if ((errors==NULL) || (!strcmp(errors, "strict")))
4450 known_errorHandler = 1;
4451 else if (!strcmp(errors, "replace"))
4452 known_errorHandler = 2;
4453 else if (!strcmp(errors, "ignore"))
4454 known_errorHandler = 3;
4455 else if (!strcmp(errors, "xmlcharrefreplace"))
4456 known_errorHandler = 4;
4457 else
4458 known_errorHandler = 0;
4459 }
4460 switch (known_errorHandler) {
4461 case 1: /* strict */
4462 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4463 goto onError;
4464 case 2: /* replace */
4465 for (p = collstart; p < collend; ++p)
4466 *output++ = '?';
4467 /* fall through */
4468 case 3: /* ignore */
4469 p = collend;
4470 break;
4471 case 4: /* xmlcharrefreplace */
4472 /* generate replacement (temporarily (mis)uses p) */
4473 for (p = collstart; p < collend; ++p)
4474 output += sprintf(output, "&#%d;", (int)*p);
4475 p = collend;
4476 break;
4477 default:
4478 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4479 encoding, reason, s, length, &exc,
4480 collstart-s, collend-s, &newpos);
4481 if (repunicode == NULL)
4482 goto onError;
4483 /* generate replacement */
4484 repsize = PyUnicode_GET_SIZE(repunicode);
4485 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4486 Py_UNICODE ch = *uni2;
4487 if (Py_UNICODE_ISSPACE(ch))
4488 *output++ = ' ';
4489 else {
4490 decimal = Py_UNICODE_TODECIMAL(ch);
4491 if (decimal >= 0)
4492 *output++ = '0' + decimal;
4493 else if (0 < ch && ch < 256)
4494 *output++ = (char)ch;
4495 else {
4496 Py_DECREF(repunicode);
4497 raise_encode_exception(&exc, encoding,
4498 s, length, collstart-s, collend-s, reason);
4499 goto onError;
4500 }
4501 }
4502 }
4503 p = s + newpos;
4504 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004505 }
4506 }
4507 /* 0-terminate the output string */
4508 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 Py_XDECREF(exc);
4510 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004511 return 0;
4512
4513 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 Py_XDECREF(exc);
4515 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004516 return -1;
4517}
4518
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519/* --- Helpers ------------------------------------------------------------ */
4520
Fredrik Lundha50d2012006-05-26 17:04:58 +00004521#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004522
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004523#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004524#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004525#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004526
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004527Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004528STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4529{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004530 if (str[0] != other[0])
4531 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004532 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4533}
4534
Fredrik Lundhb9479482006-05-26 17:22:38 +00004535#define STRINGLIB_EMPTY unicode_empty
4536
Fredrik Lundha50d2012006-05-26 17:04:58 +00004537#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004538
4539#include "stringlib/count.h"
4540#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004541#include "stringlib/partition.h"
4542
/* Helper macro to normalize slice bounds.  It reads and modifies the
   variables named start and end in the expanding scope, using
   (obj)->length as the length: a negative start is taken relative to the
   length and clamped to 0; end is first capped at the length, and a
   negative end is taken relative to the length and clamped to 0. */
#define FIX_START_END(obj)              \
    if (start < 0) {                    \
        start += (obj)->length;         \
        if (start < 0)                  \
            start = 0;                  \
    }                                   \
    if (end > (obj)->length)            \
        end = (obj)->length;            \
    if (end < 0) {                      \
        end += (obj)->length;           \
        if (end < 0)                    \
            end = 0;                    \
    }
4555
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004557 PyObject *substr,
4558 Py_ssize_t start,
4559 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004561 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004562 PyUnicodeObject* str_obj;
4563 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004564
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004565 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4566 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004568 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4569 if (!sub_obj) {
4570 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 return -1;
4572 }
Tim Petersced69f82003-09-16 20:30:58 +00004573
Fredrik Lundhc8162812006-05-26 19:33:03 +00004574 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004575
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004576 result = stringlib_count(
4577 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4578 );
4579
4580 Py_DECREF(sub_obj);
4581 Py_DECREF(str_obj);
4582
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 return result;
4584}
4585
Martin v. Löwis18e16552006-02-15 17:27:45 +00004586Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004587 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004588 Py_ssize_t start,
4589 Py_ssize_t end,
4590 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004592 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004593
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004594 str = PyUnicode_FromObject(str);
4595 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004596 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004597 sub = PyUnicode_FromObject(sub);
4598 if (!sub) {
4599 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004600 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 }
Tim Petersced69f82003-09-16 20:30:58 +00004602
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004603 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004604 result = stringlib_find_slice(
4605 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4606 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4607 start, end
4608 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004609 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004610 result = stringlib_rfind_slice(
4611 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4612 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4613 start, end
4614 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004615
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004616 Py_DECREF(str);
4617 Py_DECREF(sub);
4618
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619 return result;
4620}
4621
Tim Petersced69f82003-09-16 20:30:58 +00004622static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623int tailmatch(PyUnicodeObject *self,
4624 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004625 Py_ssize_t start,
4626 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 int direction)
4628{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629 if (substring->length == 0)
4630 return 1;
4631
Fredrik Lundhc8162812006-05-26 19:33:03 +00004632 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633
4634 end -= substring->length;
4635 if (end < start)
4636 return 0;
4637
4638 if (direction > 0) {
4639 if (Py_UNICODE_MATCH(self, end, substring))
4640 return 1;
4641 } else {
4642 if (Py_UNICODE_MATCH(self, start, substring))
4643 return 1;
4644 }
4645
4646 return 0;
4647}
4648
Martin v. Löwis18e16552006-02-15 17:27:45 +00004649Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t start,
4652 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 int direction)
4654{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004655 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004656
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 str = PyUnicode_FromObject(str);
4658 if (str == NULL)
4659 return -1;
4660 substr = PyUnicode_FromObject(substr);
4661 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004662 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 return -1;
4664 }
Tim Petersced69f82003-09-16 20:30:58 +00004665
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666 result = tailmatch((PyUnicodeObject *)str,
4667 (PyUnicodeObject *)substr,
4668 start, end, direction);
4669 Py_DECREF(str);
4670 Py_DECREF(substr);
4671 return result;
4672}
4673
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674/* Apply fixfct filter to the Unicode object self and return a
4675 reference to the modified object */
4676
Tim Petersced69f82003-09-16 20:30:58 +00004677static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678PyObject *fixup(PyUnicodeObject *self,
4679 int (*fixfct)(PyUnicodeObject *s))
4680{
4681
4682 PyUnicodeObject *u;
4683
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004684 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 if (u == NULL)
4686 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004687
4688 Py_UNICODE_COPY(u->str, self->str, self->length);
4689
Tim Peters7a29bd52001-09-12 03:03:31 +00004690 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691 /* fixfct should return TRUE if it modified the buffer. If
4692 FALSE, return a reference to the original buffer instead
4693 (to save space, not time) */
4694 Py_INCREF(self);
4695 Py_DECREF(u);
4696 return (PyObject*) self;
4697 }
4698 return (PyObject*) u;
4699}
4700
Tim Petersced69f82003-09-16 20:30:58 +00004701static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702int fixupper(PyUnicodeObject *self)
4703{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004704 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 Py_UNICODE *s = self->str;
4706 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004707
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 while (len-- > 0) {
4709 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004710
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 ch = Py_UNICODE_TOUPPER(*s);
4712 if (ch != *s) {
4713 status = 1;
4714 *s = ch;
4715 }
4716 s++;
4717 }
4718
4719 return status;
4720}
4721
Tim Petersced69f82003-09-16 20:30:58 +00004722static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723int fixlower(PyUnicodeObject *self)
4724{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004725 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 Py_UNICODE *s = self->str;
4727 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 while (len-- > 0) {
4730 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004731
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 ch = Py_UNICODE_TOLOWER(*s);
4733 if (ch != *s) {
4734 status = 1;
4735 *s = ch;
4736 }
4737 s++;
4738 }
4739
4740 return status;
4741}
4742
Tim Petersced69f82003-09-16 20:30:58 +00004743static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744int fixswapcase(PyUnicodeObject *self)
4745{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 Py_UNICODE *s = self->str;
4748 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 while (len-- > 0) {
4751 if (Py_UNICODE_ISUPPER(*s)) {
4752 *s = Py_UNICODE_TOLOWER(*s);
4753 status = 1;
4754 } else if (Py_UNICODE_ISLOWER(*s)) {
4755 *s = Py_UNICODE_TOUPPER(*s);
4756 status = 1;
4757 }
4758 s++;
4759 }
4760
4761 return status;
4762}
4763
Tim Petersced69f82003-09-16 20:30:58 +00004764static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765int fixcapitalize(PyUnicodeObject *self)
4766{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004767 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004768 Py_UNICODE *s = self->str;
4769 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004770
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004771 if (len == 0)
4772 return 0;
4773 if (Py_UNICODE_ISLOWER(*s)) {
4774 *s = Py_UNICODE_TOUPPER(*s);
4775 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004777 s++;
4778 while (--len > 0) {
4779 if (Py_UNICODE_ISUPPER(*s)) {
4780 *s = Py_UNICODE_TOLOWER(*s);
4781 status = 1;
4782 }
4783 s++;
4784 }
4785 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786}
4787
4788static
4789int fixtitle(PyUnicodeObject *self)
4790{
4791 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4792 register Py_UNICODE *e;
4793 int previous_is_cased;
4794
4795 /* Shortcut for single character strings */
4796 if (PyUnicode_GET_SIZE(self) == 1) {
4797 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4798 if (*p != ch) {
4799 *p = ch;
4800 return 1;
4801 }
4802 else
4803 return 0;
4804 }
Tim Petersced69f82003-09-16 20:30:58 +00004805
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 e = p + PyUnicode_GET_SIZE(self);
4807 previous_is_cased = 0;
4808 for (; p < e; p++) {
4809 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004810
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 if (previous_is_cased)
4812 *p = Py_UNICODE_TOLOWER(ch);
4813 else
4814 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004815
4816 if (Py_UNICODE_ISLOWER(ch) ||
4817 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 Py_UNICODE_ISTITLE(ch))
4819 previous_is_cased = 1;
4820 else
4821 previous_is_cased = 0;
4822 }
4823 return 1;
4824}
4825
Tim Peters8ce9f162004-08-27 01:49:32 +00004826PyObject *
4827PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828{
Tim Peters8ce9f162004-08-27 01:49:32 +00004829 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004830 const Py_UNICODE blank = ' ';
4831 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004832 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004833 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004834 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4835 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004836 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4837 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004838 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004839 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004840 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841
Tim Peters05eba1f2004-08-27 21:32:02 +00004842 fseq = PySequence_Fast(seq, "");
4843 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004844 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004845 }
4846
Tim Peters91879ab2004-08-27 22:35:44 +00004847 /* Grrrr. A codec may be invoked to convert str objects to
4848 * Unicode, and so it's possible to call back into Python code
4849 * during PyUnicode_FromObject(), and so it's possible for a sick
4850 * codec to change the size of fseq (if seq is a list). Therefore
4851 * we have to keep refetching the size -- can't assume seqlen
4852 * is invariant.
4853 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004854 seqlen = PySequence_Fast_GET_SIZE(fseq);
4855 /* If empty sequence, return u"". */
4856 if (seqlen == 0) {
4857 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4858 goto Done;
4859 }
4860 /* If singleton sequence with an exact Unicode, return that. */
4861 if (seqlen == 1) {
4862 item = PySequence_Fast_GET_ITEM(fseq, 0);
4863 if (PyUnicode_CheckExact(item)) {
4864 Py_INCREF(item);
4865 res = (PyUnicodeObject *)item;
4866 goto Done;
4867 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004868 }
4869
Tim Peters05eba1f2004-08-27 21:32:02 +00004870 /* At least two items to join, or one that isn't exact Unicode. */
4871 if (seqlen > 1) {
4872 /* Set up sep and seplen -- they're needed. */
4873 if (separator == NULL) {
4874 sep = &blank;
4875 seplen = 1;
4876 }
4877 else {
4878 internal_separator = PyUnicode_FromObject(separator);
4879 if (internal_separator == NULL)
4880 goto onError;
4881 sep = PyUnicode_AS_UNICODE(internal_separator);
4882 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004883 /* In case PyUnicode_FromObject() mutated seq. */
4884 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004885 }
4886 }
4887
4888 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004889 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004890 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004891 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004892 res_p = PyUnicode_AS_UNICODE(res);
4893 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004894
Tim Peters05eba1f2004-08-27 21:32:02 +00004895 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004896 Py_ssize_t itemlen;
4897 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004898
4899 item = PySequence_Fast_GET_ITEM(fseq, i);
4900 /* Convert item to Unicode. */
4901 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4902 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004903 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004904 " %.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +00004905 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004906 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004907 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004908 item = PyUnicode_FromObject(item);
4909 if (item == NULL)
4910 goto onError;
4911 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004912
Tim Peters91879ab2004-08-27 22:35:44 +00004913 /* In case PyUnicode_FromObject() mutated seq. */
4914 seqlen = PySequence_Fast_GET_SIZE(fseq);
4915
Tim Peters8ce9f162004-08-27 01:49:32 +00004916 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004918 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004919 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004920 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004921 if (i < seqlen - 1) {
4922 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004923 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004924 goto Overflow;
4925 }
4926 if (new_res_used > res_alloc) {
4927 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004928 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004929 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004930 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004931 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004932 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004933 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004934 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004936 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004937 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004939
4940 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004941 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004942 res_p += itemlen;
4943 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004944 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004945 res_p += seplen;
4946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004948 res_used = new_res_used;
4949 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004950
Tim Peters05eba1f2004-08-27 21:32:02 +00004951 /* Shrink res to match the used area; this probably can't fail,
4952 * but it's cheap to check.
4953 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004954 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004955 goto onError;
4956
4957 Done:
4958 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004959 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 return (PyObject *)res;
4961
Tim Peters8ce9f162004-08-27 01:49:32 +00004962 Overflow:
4963 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004964 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004965 Py_DECREF(item);
4966 /* fall through */
4967
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004969 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004970 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004971 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 return NULL;
4973}
4974
Tim Petersced69f82003-09-16 20:30:58 +00004975static
4976PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004977 Py_ssize_t left,
4978 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 Py_UNICODE fill)
4980{
4981 PyUnicodeObject *u;
4982
4983 if (left < 0)
4984 left = 0;
4985 if (right < 0)
4986 right = 0;
4987
Tim Peters7a29bd52001-09-12 03:03:31 +00004988 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 Py_INCREF(self);
4990 return self;
4991 }
4992
4993 u = _PyUnicode_New(left + self->length + right);
4994 if (u) {
4995 if (left)
4996 Py_UNICODE_FILL(u->str, fill, left);
4997 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4998 if (right)
4999 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5000 }
5001
5002 return u;
5003}
5004
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion relies on names from the enclosing function: a
   `PyObject *str` scratch variable, the `list` being filled, and an
   `onError` label that is jumped to when either the object creation or
   the append fails.  The temporary reference held in `str` is released
   in both the success and the failure path.

   Wrapped in do { } while (0) so the macro behaves as one statement
   (safe in an unbraced if/else); invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5015
5016static
5017PyObject *split_whitespace(PyUnicodeObject *self,
5018 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005019 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005021 register Py_ssize_t i;
5022 register Py_ssize_t j;
5023 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024 PyObject *str;
5025
5026 for (i = j = 0; i < len; ) {
5027 /* find a token */
5028 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5029 i++;
5030 j = i;
5031 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5032 i++;
5033 if (j < i) {
5034 if (maxcount-- <= 0)
5035 break;
5036 SPLIT_APPEND(self->str, j, i);
5037 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5038 i++;
5039 j = i;
5040 }
5041 }
5042 if (j < len) {
5043 SPLIT_APPEND(self->str, j, len);
5044 }
5045 return list;
5046
5047 onError:
5048 Py_DECREF(list);
5049 return NULL;
5050}
5051
5052PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005053 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005055 register Py_ssize_t i;
5056 register Py_ssize_t j;
5057 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 PyObject *list;
5059 PyObject *str;
5060 Py_UNICODE *data;
5061
5062 string = PyUnicode_FromObject(string);
5063 if (string == NULL)
5064 return NULL;
5065 data = PyUnicode_AS_UNICODE(string);
5066 len = PyUnicode_GET_SIZE(string);
5067
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068 list = PyList_New(0);
5069 if (!list)
5070 goto onError;
5071
5072 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005073 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005074
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005076 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078
5079 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005080 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 if (i < len) {
5082 if (data[i] == '\r' && i + 1 < len &&
5083 data[i+1] == '\n')
5084 i += 2;
5085 else
5086 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005087 if (keepends)
5088 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 }
Guido van Rossum86662912000-04-11 15:38:46 +00005090 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 j = i;
5092 }
5093 if (j < len) {
5094 SPLIT_APPEND(data, j, len);
5095 }
5096
5097 Py_DECREF(string);
5098 return list;
5099
5100 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005101 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 Py_DECREF(string);
5103 return NULL;
5104}
5105
Tim Petersced69f82003-09-16 20:30:58 +00005106static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107PyObject *split_char(PyUnicodeObject *self,
5108 PyObject *list,
5109 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005110 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005112 register Py_ssize_t i;
5113 register Py_ssize_t j;
5114 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 PyObject *str;
5116
5117 for (i = j = 0; i < len; ) {
5118 if (self->str[i] == ch) {
5119 if (maxcount-- <= 0)
5120 break;
5121 SPLIT_APPEND(self->str, j, i);
5122 i = j = i + 1;
5123 } else
5124 i++;
5125 }
5126 if (j <= len) {
5127 SPLIT_APPEND(self->str, j, len);
5128 }
5129 return list;
5130
5131 onError:
5132 Py_DECREF(list);
5133 return NULL;
5134}
5135
Tim Petersced69f82003-09-16 20:30:58 +00005136static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137PyObject *split_substring(PyUnicodeObject *self,
5138 PyObject *list,
5139 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005140 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005142 register Py_ssize_t i;
5143 register Py_ssize_t j;
5144 Py_ssize_t len = self->length;
5145 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 PyObject *str;
5147
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005148 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 if (Py_UNICODE_MATCH(self, i, substring)) {
5150 if (maxcount-- <= 0)
5151 break;
5152 SPLIT_APPEND(self->str, j, i);
5153 i = j = i + sublen;
5154 } else
5155 i++;
5156 }
5157 if (j <= len) {
5158 SPLIT_APPEND(self->str, j, len);
5159 }
5160 return list;
5161
5162 onError:
5163 Py_DECREF(list);
5164 return NULL;
5165}
5166
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005167static
5168PyObject *rsplit_whitespace(PyUnicodeObject *self,
5169 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005171{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005172 register Py_ssize_t i;
5173 register Py_ssize_t j;
5174 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005175 PyObject *str;
5176
5177 for (i = j = len - 1; i >= 0; ) {
5178 /* find a token */
5179 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5180 i--;
5181 j = i;
5182 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5183 i--;
5184 if (j > i) {
5185 if (maxcount-- <= 0)
5186 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005187 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005188 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5189 i--;
5190 j = i;
5191 }
5192 }
5193 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005194 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005195 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005196 if (PyList_Reverse(list) < 0)
5197 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005198 return list;
5199
5200 onError:
5201 Py_DECREF(list);
5202 return NULL;
5203}
5204
5205static
5206PyObject *rsplit_char(PyUnicodeObject *self,
5207 PyObject *list,
5208 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005209 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005211 register Py_ssize_t i;
5212 register Py_ssize_t j;
5213 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005214 PyObject *str;
5215
5216 for (i = j = len - 1; i >= 0; ) {
5217 if (self->str[i] == ch) {
5218 if (maxcount-- <= 0)
5219 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005220 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005221 j = i = i - 1;
5222 } else
5223 i--;
5224 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005225 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005226 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005227 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005228 if (PyList_Reverse(list) < 0)
5229 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005230 return list;
5231
5232 onError:
5233 Py_DECREF(list);
5234 return NULL;
5235}
5236
5237static
5238PyObject *rsplit_substring(PyUnicodeObject *self,
5239 PyObject *list,
5240 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005241 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005242{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243 register Py_ssize_t i;
5244 register Py_ssize_t j;
5245 Py_ssize_t len = self->length;
5246 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005247 PyObject *str;
5248
5249 for (i = len - sublen, j = len; i >= 0; ) {
5250 if (Py_UNICODE_MATCH(self, i, substring)) {
5251 if (maxcount-- <= 0)
5252 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005253 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005254 j = i;
5255 i -= sublen;
5256 } else
5257 i--;
5258 }
5259 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005260 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005261 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005262 if (PyList_Reverse(list) < 0)
5263 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005264 return list;
5265
5266 onError:
5267 Py_DECREF(list);
5268 return NULL;
5269}
5270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271#undef SPLIT_APPEND
5272
5273static
5274PyObject *split(PyUnicodeObject *self,
5275 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005276 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277{
5278 PyObject *list;
5279
5280 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005281 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282
5283 list = PyList_New(0);
5284 if (!list)
5285 return NULL;
5286
5287 if (substring == NULL)
5288 return split_whitespace(self,list,maxcount);
5289
5290 else if (substring->length == 1)
5291 return split_char(self,list,substring->str[0],maxcount);
5292
5293 else if (substring->length == 0) {
5294 Py_DECREF(list);
5295 PyErr_SetString(PyExc_ValueError, "empty separator");
5296 return NULL;
5297 }
5298 else
5299 return split_substring(self,list,substring,maxcount);
5300}
5301
Tim Petersced69f82003-09-16 20:30:58 +00005302static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005303PyObject *rsplit(PyUnicodeObject *self,
5304 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005305 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005306{
5307 PyObject *list;
5308
5309 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005310 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005311
5312 list = PyList_New(0);
5313 if (!list)
5314 return NULL;
5315
5316 if (substring == NULL)
5317 return rsplit_whitespace(self,list,maxcount);
5318
5319 else if (substring->length == 1)
5320 return rsplit_char(self,list,substring->str[0],maxcount);
5321
5322 else if (substring->length == 0) {
5323 Py_DECREF(list);
5324 PyErr_SetString(PyExc_ValueError, "empty separator");
5325 return NULL;
5326 }
5327 else
5328 return rsplit_substring(self,list,substring,maxcount);
5329}
5330
5331static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332PyObject *replace(PyUnicodeObject *self,
5333 PyUnicodeObject *str1,
5334 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005335 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336{
5337 PyUnicodeObject *u;
5338
5339 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005340 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341
Fredrik Lundh347ee272006-05-24 16:35:18 +00005342 if (str1->length == str2->length) {
5343 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005344 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005345 if (str1->length == 1) {
5346 /* replace characters */
5347 Py_UNICODE u1, u2;
5348 if (!findchar(self->str, self->length, str1->str[0]))
5349 goto nothing;
5350 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5351 if (!u)
5352 return NULL;
5353 Py_UNICODE_COPY(u->str, self->str, self->length);
5354 u1 = str1->str[0];
5355 u2 = str2->str[0];
5356 for (i = 0; i < u->length; i++)
5357 if (u->str[i] == u1) {
5358 if (--maxcount < 0)
5359 break;
5360 u->str[i] = u2;
5361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005363 i = fastsearch(
5364 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005366 if (i < 0)
5367 goto nothing;
5368 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5369 if (!u)
5370 return NULL;
5371 Py_UNICODE_COPY(u->str, self->str, self->length);
5372 while (i <= self->length - str1->length)
5373 if (Py_UNICODE_MATCH(self, i, str1)) {
5374 if (--maxcount < 0)
5375 break;
5376 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5377 i += str1->length;
5378 } else
5379 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005382
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005383 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005384 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 Py_UNICODE *p;
5386
5387 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005388 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 if (n > maxcount)
5390 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005391 if (n == 0)
5392 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005393 /* new_size = self->length + n * (str2->length - str1->length)); */
5394 delta = (str2->length - str1->length);
5395 if (delta == 0) {
5396 new_size = self->length;
5397 } else {
5398 product = n * (str2->length - str1->length);
5399 if ((product / (str2->length - str1->length)) != n) {
5400 PyErr_SetString(PyExc_OverflowError,
5401 "replace string is too long");
5402 return NULL;
5403 }
5404 new_size = self->length + product;
5405 if (new_size < 0) {
5406 PyErr_SetString(PyExc_OverflowError,
5407 "replace string is too long");
5408 return NULL;
5409 }
5410 }
5411 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005412 if (!u)
5413 return NULL;
5414 i = 0;
5415 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005416 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005417 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005418 while (n-- > 0) {
5419 /* look for next match */
5420 j = i;
5421 while (j <= e) {
5422 if (Py_UNICODE_MATCH(self, j, str1))
5423 break;
5424 j++;
5425 }
5426 if (j > i) {
5427 if (j > e)
5428 break;
5429 /* copy unchanged part [i:j] */
5430 Py_UNICODE_COPY(p, self->str+i, j-i);
5431 p += j - i;
5432 }
5433 /* copy substitution string */
5434 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005435 Py_UNICODE_COPY(p, str2->str, str2->length);
5436 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005437 }
5438 i = j + str1->length;
5439 }
5440 if (i < self->length)
5441 /* copy tail [i:] */
5442 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005443 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005444 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005445 while (n > 0) {
5446 Py_UNICODE_COPY(p, str2->str, str2->length);
5447 p += str2->length;
5448 if (--n <= 0)
5449 break;
5450 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005452 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 }
5454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005456
5457nothing:
5458 /* nothing to replace; return original string (when possible) */
5459 if (PyUnicode_CheckExact(self)) {
5460 Py_INCREF(self);
5461 return (PyObject *) self;
5462 }
5463 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464}
5465
5466/* --- Unicode Object Methods --------------------------------------------- */
5467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005468PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469"S.title() -> unicode\n\
5470\n\
5471Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005472characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473
5474static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005475unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 return fixup(self, fixtitle);
5478}
5479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005480PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481"S.capitalize() -> unicode\n\
5482\n\
5483Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005484have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
5486static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005487unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 return fixup(self, fixcapitalize);
5490}
5491
#if 0
/* Disabled: compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run every word through
   fixup(..., fixcapitalize) and join the words again with the default
   separator of PyUnicode_Join().  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *old = PyList_GET_ITEM(words, k);
        PyObject *capped = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capped == NULL)
            goto done;
        /* Replace the list slot: drop the old word, store the new one. */
        Py_DECREF(old);
        PyList_SET_ITEM(words, k, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5529
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005530/* Argument converter. Coerces to a single unicode character */
5531
5532static int
5533convert_uc(PyObject *obj, void *addr)
5534{
5535 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5536 PyObject *uniobj;
5537 Py_UNICODE *unistr;
5538
5539 uniobj = PyUnicode_FromObject(obj);
5540 if (uniobj == NULL) {
5541 PyErr_SetString(PyExc_TypeError,
5542 "The fill character cannot be converted to Unicode");
5543 return 0;
5544 }
5545 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5546 PyErr_SetString(PyExc_TypeError,
5547 "The fill character must be exactly one character long");
5548 Py_DECREF(uniobj);
5549 return 0;
5550 }
5551 unistr = PyUnicode_AS_UNICODE(uniobj);
5552 *fillcharloc = unistr[0];
5553 Py_DECREF(uniobj);
5554 return 1;
5555}
5556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005557PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005558"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005560Return S centered in a Unicode string of length width. Padding is\n\
5561done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
5563static PyObject *
5564unicode_center(PyUnicodeObject *self, PyObject *args)
5565{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005566 Py_ssize_t marg, left;
5567 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005568 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569
Thomas Woutersde017742006-02-16 19:34:37 +00005570 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 return NULL;
5572
Tim Peters7a29bd52001-09-12 03:03:31 +00005573 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 Py_INCREF(self);
5575 return (PyObject*) self;
5576 }
5577
5578 marg = width - self->length;
5579 left = marg / 2 + (marg & width & 1);
5580
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005581 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582}
5583
Marc-André Lemburge5034372000-08-08 08:04:29 +00005584#if 0
5585
5586/* This code should go into some future Unicode collation support
5587 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005588 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005589
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005590/* speedy UTF-16 code point order comparison */
5591/* gleaned from: */
5592/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5593
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005594static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005595{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005596 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005597 0, 0, 0, 0, 0, 0, 0, 0,
5598 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005599 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005600};
5601
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602static int
5603unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5604{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005605 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 Py_UNICODE *s1 = str1->str;
5608 Py_UNICODE *s2 = str2->str;
5609
5610 len1 = str1->length;
5611 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005612
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005614 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005615
5616 c1 = *s1++;
5617 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005618
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005619 if (c1 > (1<<11) * 26)
5620 c1 += utf16Fixup[c1>>11];
5621 if (c2 > (1<<11) * 26)
5622 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005623 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005624
5625 if (c1 != c2)
5626 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005627
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005628 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 }
5630
5631 return (len1 < len2) ? -1 : (len1 != len2);
5632}
5633
Marc-André Lemburge5034372000-08-08 08:04:29 +00005634#else
5635
5636static int
5637unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5638{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005639 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005640
5641 Py_UNICODE *s1 = str1->str;
5642 Py_UNICODE *s2 = str2->str;
5643
5644 len1 = str1->length;
5645 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005646
Marc-André Lemburge5034372000-08-08 08:04:29 +00005647 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005648 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005649
Fredrik Lundh45714e92001-06-26 16:39:36 +00005650 c1 = *s1++;
5651 c2 = *s2++;
5652
5653 if (c1 != c2)
5654 return (c1 < c2) ? -1 : 1;
5655
Marc-André Lemburge5034372000-08-08 08:04:29 +00005656 len1--; len2--;
5657 }
5658
5659 return (len1 < len2) ? -1 : (len1 != len2);
5660}
5661
5662#endif
5663
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664int PyUnicode_Compare(PyObject *left,
5665 PyObject *right)
5666{
5667 PyUnicodeObject *u = NULL, *v = NULL;
5668 int result;
5669
5670 /* Coerce the two arguments */
5671 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5672 if (u == NULL)
5673 goto onError;
5674 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5675 if (v == NULL)
5676 goto onError;
5677
Thomas Wouters7e474022000-07-16 12:04:32 +00005678 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 if (v == u) {
5680 Py_DECREF(u);
5681 Py_DECREF(v);
5682 return 0;
5683 }
5684
5685 result = unicode_compare(u, v);
5686
5687 Py_DECREF(u);
5688 Py_DECREF(v);
5689 return result;
5690
5691onError:
5692 Py_XDECREF(u);
5693 Py_XDECREF(v);
5694 return -1;
5695}
5696
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005697PyObject *PyUnicode_RichCompare(PyObject *left,
5698 PyObject *right,
5699 int op)
5700{
5701 int result;
5702
5703 result = PyUnicode_Compare(left, right);
5704 if (result == -1 && PyErr_Occurred())
5705 goto onError;
5706
5707 /* Convert the return value to a Boolean */
5708 switch (op) {
5709 case Py_EQ:
5710 result = (result == 0);
5711 break;
5712 case Py_NE:
5713 result = (result != 0);
5714 break;
5715 case Py_LE:
5716 result = (result <= 0);
5717 break;
5718 case Py_GE:
5719 result = (result >= 0);
5720 break;
5721 case Py_LT:
5722 result = (result == -1);
5723 break;
5724 case Py_GT:
5725 result = (result == 1);
5726 break;
5727 }
5728 return PyBool_FromLong(result);
5729
5730 onError:
5731
5732 /* Standard case
5733
5734 Type errors mean that PyUnicode_FromObject() could not convert
5735 one of the arguments (usually the right hand side) to Unicode,
5736 ie. we can't handle the comparison request. However, it is
5737 possible that the other object knows a comparison method, which
5738 is why we return Py_NotImplemented to give the other object a
5739 chance.
5740
5741 */
5742 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5743 PyErr_Clear();
5744 Py_INCREF(Py_NotImplemented);
5745 return Py_NotImplemented;
5746 }
5747 if (op != Py_EQ && op != Py_NE)
5748 return NULL;
5749
5750 /* Equality comparison.
5751
5752 This is a special case: we silence any PyExc_UnicodeDecodeError
5753 and instead turn it into a PyErr_UnicodeWarning.
5754
5755 */
5756 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5757 return NULL;
5758 PyErr_Clear();
5759 if (PyErr_Warn(PyExc_UnicodeWarning,
5760 (op == Py_EQ) ?
5761 "Unicode equal comparison "
5762 "failed to convert both arguments to Unicode - "
5763 "interpreting them as being unequal" :
5764 "Unicode unequal comparison "
5765 "failed to convert both arguments to Unicode - "
5766 "interpreting them as being unequal"
5767 ) < 0)
5768 return NULL;
5769 result = (op == Py_NE);
5770 return PyBool_FromLong(result);
5771}
5772
Guido van Rossum403d68b2000-03-13 15:55:09 +00005773int PyUnicode_Contains(PyObject *container,
5774 PyObject *element)
5775{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005776 PyObject *str, *sub;
5777 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005778
5779 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005780 sub = PyUnicode_FromObject(element);
5781 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005782 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005783 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005784 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005785 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005786
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005787 str = PyUnicode_FromObject(container);
5788 if (!str) {
5789 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005790 return -1;
5791 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005792
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005793 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005794
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005795 Py_DECREF(str);
5796 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005797
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005798 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005799}
5800
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801/* Concat to string or Unicode object giving a new Unicode object. */
5802
5803PyObject *PyUnicode_Concat(PyObject *left,
5804 PyObject *right)
5805{
5806 PyUnicodeObject *u = NULL, *v = NULL, *w;
5807
5808 /* Coerce the two arguments */
5809 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5810 if (u == NULL)
5811 goto onError;
5812 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5813 if (v == NULL)
5814 goto onError;
5815
5816 /* Shortcuts */
5817 if (v == unicode_empty) {
5818 Py_DECREF(v);
5819 return (PyObject *)u;
5820 }
5821 if (u == unicode_empty) {
5822 Py_DECREF(u);
5823 return (PyObject *)v;
5824 }
5825
5826 /* Concat the two Unicode strings */
5827 w = _PyUnicode_New(u->length + v->length);
5828 if (w == NULL)
5829 goto onError;
5830 Py_UNICODE_COPY(w->str, u->str, u->length);
5831 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5832
5833 Py_DECREF(u);
5834 Py_DECREF(v);
5835 return (PyObject *)w;
5836
5837onError:
5838 Py_XDECREF(u);
5839 Py_XDECREF(v);
5840 return NULL;
5841}
5842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005843PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844"S.count(sub[, start[, end]]) -> int\n\
5845\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005846Return the number of non-overlapping occurrences of substring sub in\n\
5847Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005848interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849
5850static PyObject *
5851unicode_count(PyUnicodeObject *self, PyObject *args)
5852{
5853 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005854 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005855 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 PyObject *result;
5857
Guido van Rossumb8872e62000-05-09 14:14:27 +00005858 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5859 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 return NULL;
5861
5862 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005863 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 if (substring == NULL)
5865 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005866
Fredrik Lundhc8162812006-05-26 19:33:03 +00005867 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005869 result = PyInt_FromSsize_t(
5870 stringlib_count(self->str + start, end - start,
5871 substring->str, substring->length)
5872 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
5874 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005875
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 return result;
5877}
5878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005879PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005880"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005882Encodes S using the codec registered for encoding. encoding defaults\n\
5883to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005884handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5886'xmlcharrefreplace' as well as any other name registered with\n\
5887codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888
5889static PyObject *
5890unicode_encode(PyUnicodeObject *self, PyObject *args)
5891{
5892 char *encoding = NULL;
5893 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005894 PyObject *v;
5895
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5897 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005898 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005899 if (v == NULL)
5900 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005901 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5902 PyErr_Format(PyExc_TypeError,
5903 "encoder did not return a string/unicode object "
5904 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005905 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005906 Py_DECREF(v);
5907 return NULL;
5908 }
5909 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005910
5911 onError:
5912 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005913}
5914
5915PyDoc_STRVAR(decode__doc__,
5916"S.decode([encoding[,errors]]) -> string or unicode\n\
5917\n\
5918Decodes S using the codec registered for encoding. encoding defaults\n\
5919to the default encoding. errors may be given to set a different error\n\
5920handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5921a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5922as well as any other name registerd with codecs.register_error that is\n\
5923able to handle UnicodeDecodeErrors.");
5924
5925static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005926unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005927{
5928 char *encoding = NULL;
5929 char *errors = NULL;
5930 PyObject *v;
5931
5932 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5933 return NULL;
5934 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005935 if (v == NULL)
5936 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005937 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5938 PyErr_Format(PyExc_TypeError,
5939 "decoder did not return a string/unicode object "
5940 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005941 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005942 Py_DECREF(v);
5943 return NULL;
5944 }
5945 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005946
5947 onError:
5948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949}
5950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952"S.expandtabs([tabsize]) -> unicode\n\
5953\n\
5954Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005955If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956
5957static PyObject*
5958unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5959{
5960 Py_UNICODE *e;
5961 Py_UNICODE *p;
5962 Py_UNICODE *q;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005963 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 PyUnicodeObject *u;
5965 int tabsize = 8;
5966
5967 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5968 return NULL;
5969
Thomas Wouters7e474022000-07-16 12:04:32 +00005970 /* First pass: determine size of output string */
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005971 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 e = self->str + self->length;
5973 for (p = self->str; p < e; p++)
5974 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005975 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 j += tabsize - (j % tabsize);
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005977 if (old_j > j) {
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005978 PyErr_SetString(PyExc_OverflowError,
5979 "new string is too long");
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005980 return NULL;
5981 }
5982 old_j = j;
5983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 }
5985 else {
5986 j++;
5987 if (*p == '\n' || *p == '\r') {
5988 i += j;
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005989 old_j = j = 0;
5990 if (i < 0) {
5991 PyErr_SetString(PyExc_OverflowError,
5992 "new string is too long");
5993 return NULL;
5994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
5996 }
5997
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005998 if ((i + j) < 0) {
5999 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6000 return NULL;
6001 }
6002
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 /* Second pass: create output string and fill it */
6004 u = _PyUnicode_New(i + j);
6005 if (!u)
6006 return NULL;
6007
6008 j = 0;
6009 q = u->str;
6010
6011 for (p = self->str; p < e; p++)
6012 if (*p == '\t') {
6013 if (tabsize > 0) {
6014 i = tabsize - (j % tabsize);
6015 j += i;
6016 while (i--)
6017 *q++ = ' ';
6018 }
6019 }
6020 else {
6021 j++;
6022 *q++ = *p;
6023 if (*p == '\n' || *p == '\r')
6024 j = 0;
6025 }
6026
6027 return (PyObject*) u;
6028}
6029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006030PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031"S.find(sub [,start [,end]]) -> int\n\
6032\n\
6033Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006034such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035arguments start and end are interpreted as in slice notation.\n\
6036\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006037Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
6039static PyObject *
6040unicode_find(PyUnicodeObject *self, PyObject *args)
6041{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006042 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006043 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006044 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006045 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046
Guido van Rossumb8872e62000-05-09 14:14:27 +00006047 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6048 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006050 substring = PyUnicode_FromObject(substring);
6051 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 return NULL;
6053
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006054 result = stringlib_find_slice(
6055 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6056 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6057 start, end
6058 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059
6060 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006061
6062 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063}
6064
6065static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006066unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067{
6068 if (index < 0 || index >= self->length) {
6069 PyErr_SetString(PyExc_IndexError, "string index out of range");
6070 return NULL;
6071 }
6072
6073 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6074}
6075
6076static long
6077unicode_hash(PyUnicodeObject *self)
6078{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006079 /* Since Unicode objects compare equal to their ASCII string
6080 counterparts, they should use the individual character values
6081 as basis for their hash value. This is needed to assure that
6082 strings and Unicode objects behave in the same way as
6083 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
Martin v. Löwis18e16552006-02-15 17:27:45 +00006085 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006086 register Py_UNICODE *p;
6087 register long x;
6088
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 if (self->hash != -1)
6090 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006091 len = PyUnicode_GET_SIZE(self);
6092 p = PyUnicode_AS_UNICODE(self);
6093 x = *p << 7;
6094 while (--len >= 0)
6095 x = (1000003*x) ^ *p++;
6096 x ^= PyUnicode_GET_SIZE(self);
6097 if (x == -1)
6098 x = -2;
6099 self->hash = x;
6100 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101}
6102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006103PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104"S.index(sub [,start [,end]]) -> int\n\
6105\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006106Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107
6108static PyObject *
6109unicode_index(PyUnicodeObject *self, PyObject *args)
6110{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006111 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006112 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006113 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006114 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Guido van Rossumb8872e62000-05-09 14:14:27 +00006116 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6117 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006119 substring = PyUnicode_FromObject(substring);
6120 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 return NULL;
6122
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006123 result = stringlib_find_slice(
6124 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6125 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6126 start, end
6127 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
6129 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006130
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 if (result < 0) {
6132 PyErr_SetString(PyExc_ValueError, "substring not found");
6133 return NULL;
6134 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006135
Martin v. Löwis18e16552006-02-15 17:27:45 +00006136 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137}
6138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006139PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006140"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006142Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006143at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144
6145static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006146unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
6148 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6149 register const Py_UNICODE *e;
6150 int cased;
6151
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 /* Shortcut for single character strings */
6153 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006156 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006157 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006158 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 e = p + PyUnicode_GET_SIZE(self);
6161 cased = 0;
6162 for (; p < e; p++) {
6163 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006166 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 else if (!cased && Py_UNICODE_ISLOWER(ch))
6168 cased = 1;
6169 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006170 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171}
6172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006173PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006174"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006176Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006177at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178
6179static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006180unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181{
6182 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6183 register const Py_UNICODE *e;
6184 int cased;
6185
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 /* Shortcut for single character strings */
6187 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006188 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006190 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006191 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006192 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006193
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 e = p + PyUnicode_GET_SIZE(self);
6195 cased = 0;
6196 for (; p < e; p++) {
6197 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006198
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006200 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 else if (!cased && Py_UNICODE_ISUPPER(ch))
6202 cased = 1;
6203 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006204 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205}
6206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006207PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006208"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006210Return True if S is a titlecased string and there is at least one\n\
6211character in S, i.e. upper- and titlecase characters may only\n\
6212follow uncased characters and lowercase characters only cased ones.\n\
6213Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214
6215static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006216unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217{
6218 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6219 register const Py_UNICODE *e;
6220 int cased, previous_is_cased;
6221
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 /* Shortcut for single character strings */
6223 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006224 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6225 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006227 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006228 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006229 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006230
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 e = p + PyUnicode_GET_SIZE(self);
6232 cased = 0;
6233 previous_is_cased = 0;
6234 for (; p < e; p++) {
6235 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006236
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6238 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006239 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 previous_is_cased = 1;
6241 cased = 1;
6242 }
6243 else if (Py_UNICODE_ISLOWER(ch)) {
6244 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006245 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 previous_is_cased = 1;
6247 cased = 1;
6248 }
6249 else
6250 previous_is_cased = 0;
6251 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006252 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253}
6254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006255PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006256"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006258Return True if all characters in S are whitespace\n\
6259and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
6261static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006262unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263{
6264 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6265 register const Py_UNICODE *e;
6266
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 /* Shortcut for single character strings */
6268 if (PyUnicode_GET_SIZE(self) == 1 &&
6269 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006270 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006272 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006273 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006274 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006275
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 e = p + PyUnicode_GET_SIZE(self);
6277 for (; p < e; p++) {
6278 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006279 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006281 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282}
6283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006284PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006285"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006286\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006287Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006288and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006289
6290static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006291unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006292{
6293 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6294 register const Py_UNICODE *e;
6295
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006296 /* Shortcut for single character strings */
6297 if (PyUnicode_GET_SIZE(self) == 1 &&
6298 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006299 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006300
6301 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006302 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006303 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006304
6305 e = p + PyUnicode_GET_SIZE(self);
6306 for (; p < e; p++) {
6307 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006308 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006309 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006310 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006311}
6312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006313PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006314"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006315\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006316Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006317and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006318
6319static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006320unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006321{
6322 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6323 register const Py_UNICODE *e;
6324
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006325 /* Shortcut for single character strings */
6326 if (PyUnicode_GET_SIZE(self) == 1 &&
6327 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006328 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006329
6330 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006331 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006332 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006333
6334 e = p + PyUnicode_GET_SIZE(self);
6335 for (; p < e; p++) {
6336 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006337 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006338 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006339 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006340}
6341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006342PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006343"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006345Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006346False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347
6348static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006349unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350{
6351 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6352 register const Py_UNICODE *e;
6353
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 /* Shortcut for single character strings */
6355 if (PyUnicode_GET_SIZE(self) == 1 &&
6356 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006357 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006359 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006360 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006361 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006362
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 e = p + PyUnicode_GET_SIZE(self);
6364 for (; p < e; p++) {
6365 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006366 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006368 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369}
6370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006371PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006372"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006374Return True if all characters in S are digits\n\
6375and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376
6377static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006378unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379{
6380 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6381 register const Py_UNICODE *e;
6382
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 /* Shortcut for single character strings */
6384 if (PyUnicode_GET_SIZE(self) == 1 &&
6385 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006386 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006388 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006389 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006390 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 e = p + PyUnicode_GET_SIZE(self);
6393 for (; p < e; p++) {
6394 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006395 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006397 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398}
6399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006400PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006401"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006403Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006404False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405
6406static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006407unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408{
6409 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6410 register const Py_UNICODE *e;
6411
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 /* Shortcut for single character strings */
6413 if (PyUnicode_GET_SIZE(self) == 1 &&
6414 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006415 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006417 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006418 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006419 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006420
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 e = p + PyUnicode_GET_SIZE(self);
6422 for (; p < e; p++) {
6423 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006424 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006426 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427}
6428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006429PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430"S.join(sequence) -> unicode\n\
6431\n\
6432Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006433sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434
6435static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006436unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006438 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439}
6440
Martin v. Löwis18e16552006-02-15 17:27:45 +00006441static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442unicode_length(PyUnicodeObject *self)
6443{
6444 return self->length;
6445}
6446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006447PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006448"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449\n\
6450Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006451done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452
6453static PyObject *
6454unicode_ljust(PyUnicodeObject *self, PyObject *args)
6455{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006456 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006457 Py_UNICODE fillchar = ' ';
6458
Martin v. Löwis412fb672006-04-13 06:34:32 +00006459 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 return NULL;
6461
Tim Peters7a29bd52001-09-12 03:03:31 +00006462 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 Py_INCREF(self);
6464 return (PyObject*) self;
6465 }
6466
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006467 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468}
6469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006470PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471"S.lower() -> unicode\n\
6472\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006473Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
6475static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006476unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 return fixup(self, fixlower);
6479}
6480
/* Strip types: which end(s) of the string get trimmed */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple formats, indexed by the strip types above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip type i: the format with its leading "|O:" skipped */
#define STRIPNAME(i) (stripformat[i] + 3)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006490/* externally visible for str.strip(unicode) */
6491PyObject *
6492_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6493{
6494 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006496 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006497 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6498 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006499
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006500 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6501
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006502 i = 0;
6503 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006504 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6505 i++;
6506 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006507 }
6508
6509 j = len;
6510 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006511 do {
6512 j--;
6513 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6514 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006515 }
6516
6517 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006518 Py_INCREF(self);
6519 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006520 }
6521 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006522 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006523}
6524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
6526static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006527do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006529 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006530 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006531
6532 i = 0;
6533 if (striptype != RIGHTSTRIP) {
6534 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6535 i++;
6536 }
6537 }
6538
6539 j = len;
6540 if (striptype != LEFTSTRIP) {
6541 do {
6542 j--;
6543 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6544 j++;
6545 }
6546
6547 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6548 Py_INCREF(self);
6549 return (PyObject*)self;
6550 }
6551 else
6552 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553}
6554
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006555
6556static PyObject *
6557do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6558{
6559 PyObject *sep = NULL;
6560
6561 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6562 return NULL;
6563
6564 if (sep != NULL && sep != Py_None) {
6565 if (PyUnicode_Check(sep))
6566 return _PyUnicode_XStrip(self, striptype, sep);
6567 else if (PyString_Check(sep)) {
6568 PyObject *res;
6569 sep = PyUnicode_FromObject(sep);
6570 if (sep==NULL)
6571 return NULL;
6572 res = _PyUnicode_XStrip(self, striptype, sep);
6573 Py_DECREF(sep);
6574 return res;
6575 }
6576 else {
6577 PyErr_Format(PyExc_TypeError,
6578 "%s arg must be None, unicode or str",
6579 STRIPNAME(striptype));
6580 return NULL;
6581 }
6582 }
6583
6584 return do_strip(self, striptype);
6585}
6586
6587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006588PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006589"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006590\n\
6591Return a copy of the string S with leading and trailing\n\
6592whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006593If chars is given and not None, remove characters in chars instead.\n\
6594If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006595
6596static PyObject *
6597unicode_strip(PyUnicodeObject *self, PyObject *args)
6598{
6599 if (PyTuple_GET_SIZE(args) == 0)
6600 return do_strip(self, BOTHSTRIP); /* Common case */
6601 else
6602 return do_argstrip(self, BOTHSTRIP, args);
6603}
6604
6605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006606PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006607"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006608\n\
6609Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006610If chars is given and not None, remove characters in chars instead.\n\
6611If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006612
6613static PyObject *
6614unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6615{
6616 if (PyTuple_GET_SIZE(args) == 0)
6617 return do_strip(self, LEFTSTRIP); /* Common case */
6618 else
6619 return do_argstrip(self, LEFTSTRIP, args);
6620}
6621
6622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006623PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006624"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006625\n\
6626Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006627If chars is given and not None, remove characters in chars instead.\n\
6628If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006629
6630static PyObject *
6631unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6632{
6633 if (PyTuple_GET_SIZE(args) == 0)
6634 return do_strip(self, RIGHTSTRIP); /* Common case */
6635 else
6636 return do_argstrip(self, RIGHTSTRIP, args);
6637}
6638
6639
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006641unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642{
6643 PyUnicodeObject *u;
6644 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006645 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006646 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647
6648 if (len < 0)
6649 len = 0;
6650
Tim Peters7a29bd52001-09-12 03:03:31 +00006651 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 /* no repeat, return original string */
6653 Py_INCREF(str);
6654 return (PyObject*) str;
6655 }
Tim Peters8f422462000-09-09 06:13:41 +00006656
6657 /* ensure # of chars needed doesn't overflow int and # of bytes
6658 * needed doesn't overflow size_t
6659 */
6660 nchars = len * str->length;
6661 if (len && nchars / len != str->length) {
6662 PyErr_SetString(PyExc_OverflowError,
6663 "repeated string is too long");
6664 return NULL;
6665 }
6666 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6667 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6668 PyErr_SetString(PyExc_OverflowError,
6669 "repeated string is too long");
6670 return NULL;
6671 }
6672 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 if (!u)
6674 return NULL;
6675
6676 p = u->str;
6677
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006678 if (str->length == 1 && len > 0) {
6679 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006680 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006681 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006682 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006683 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006684 done = str->length;
6685 }
6686 while (done < nchars) {
6687 int n = (done <= nchars-done) ? done : nchars-done;
6688 Py_UNICODE_COPY(p+done, p, n);
6689 done += n;
6690 }
6691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
6693 return (PyObject*) u;
6694}
6695
6696PyObject *PyUnicode_Replace(PyObject *obj,
6697 PyObject *subobj,
6698 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006699 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700{
6701 PyObject *self;
6702 PyObject *str1;
6703 PyObject *str2;
6704 PyObject *result;
6705
6706 self = PyUnicode_FromObject(obj);
6707 if (self == NULL)
6708 return NULL;
6709 str1 = PyUnicode_FromObject(subobj);
6710 if (str1 == NULL) {
6711 Py_DECREF(self);
6712 return NULL;
6713 }
6714 str2 = PyUnicode_FromObject(replobj);
6715 if (str2 == NULL) {
6716 Py_DECREF(self);
6717 Py_DECREF(str1);
6718 return NULL;
6719 }
Tim Petersced69f82003-09-16 20:30:58 +00006720 result = replace((PyUnicodeObject *)self,
6721 (PyUnicodeObject *)str1,
6722 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 maxcount);
6724 Py_DECREF(self);
6725 Py_DECREF(str1);
6726 Py_DECREF(str2);
6727 return result;
6728}
6729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006730PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731"S.replace (old, new[, maxsplit]) -> unicode\n\
6732\n\
6733Return a copy of S with all occurrences of substring\n\
6734old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736
6737static PyObject*
6738unicode_replace(PyUnicodeObject *self, PyObject *args)
6739{
6740 PyUnicodeObject *str1;
6741 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006742 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 PyObject *result;
6744
Martin v. Löwis18e16552006-02-15 17:27:45 +00006745 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 return NULL;
6747 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6748 if (str1 == NULL)
6749 return NULL;
6750 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006751 if (str2 == NULL) {
6752 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
6756 result = replace(self, str1, str2, maxcount);
6757
6758 Py_DECREF(str1);
6759 Py_DECREF(str2);
6760 return result;
6761}
6762
6763static
6764PyObject *unicode_repr(PyObject *unicode)
6765{
6766 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6767 PyUnicode_GET_SIZE(unicode),
6768 1);
6769}
6770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006771PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772"S.rfind(sub [,start [,end]]) -> int\n\
6773\n\
6774Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006775such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776arguments start and end are interpreted as in slice notation.\n\
6777\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006778Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779
6780static PyObject *
6781unicode_rfind(PyUnicodeObject *self, PyObject *args)
6782{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006783 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006784 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006785 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006786 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
Guido van Rossumb8872e62000-05-09 14:14:27 +00006788 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6789 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006791 substring = PyUnicode_FromObject(substring);
6792 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 return NULL;
6794
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006795 result = stringlib_rfind_slice(
6796 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6797 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6798 start, end
6799 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800
6801 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006802
6803 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804}
6805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807"S.rindex(sub [,start [,end]]) -> int\n\
6808\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006809Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
6811static PyObject *
6812unicode_rindex(PyUnicodeObject *self, PyObject *args)
6813{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006814 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006815 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006816 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006817 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
Guido van Rossumb8872e62000-05-09 14:14:27 +00006819 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6820 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006822 substring = PyUnicode_FromObject(substring);
6823 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 return NULL;
6825
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006826 result = stringlib_rfind_slice(
6827 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6828 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6829 start, end
6830 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831
6832 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 if (result < 0) {
6835 PyErr_SetString(PyExc_ValueError, "substring not found");
6836 return NULL;
6837 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006838 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839}
6840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006841PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006842"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843\n\
6844Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006845done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846
6847static PyObject *
6848unicode_rjust(PyUnicodeObject *self, PyObject *args)
6849{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006850 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006851 Py_UNICODE fillchar = ' ';
6852
Martin v. Löwis412fb672006-04-13 06:34:32 +00006853 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 return NULL;
6855
Tim Peters7a29bd52001-09-12 03:03:31 +00006856 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 Py_INCREF(self);
6858 return (PyObject*) self;
6859 }
6860
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006861 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862}
6863
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006865unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866{
6867 /* standard clamping */
6868 if (start < 0)
6869 start = 0;
6870 if (end < 0)
6871 end = 0;
6872 if (end > self->length)
6873 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006874 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 /* full slice, return original string */
6876 Py_INCREF(self);
6877 return (PyObject*) self;
6878 }
6879 if (start > end)
6880 start = end;
6881 /* copy slice */
6882 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6883 end - start);
6884}
6885
6886PyObject *PyUnicode_Split(PyObject *s,
6887 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006888 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889{
6890 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 s = PyUnicode_FromObject(s);
6893 if (s == NULL)
6894 return NULL;
6895 if (sep != NULL) {
6896 sep = PyUnicode_FromObject(sep);
6897 if (sep == NULL) {
6898 Py_DECREF(s);
6899 return NULL;
6900 }
6901 }
6902
6903 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6904
6905 Py_DECREF(s);
6906 Py_XDECREF(sep);
6907 return result;
6908}
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911"S.split([sep [,maxsplit]]) -> list of strings\n\
6912\n\
6913Return a list of the words in S, using sep as the\n\
6914delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006915splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006916any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917
6918static PyObject*
6919unicode_split(PyUnicodeObject *self, PyObject *args)
6920{
6921 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006922 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923
Martin v. Löwis18e16552006-02-15 17:27:45 +00006924 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 return NULL;
6926
6927 if (substring == Py_None)
6928 return split(self, NULL, maxcount);
6929 else if (PyUnicode_Check(substring))
6930 return split(self, (PyUnicodeObject *)substring, maxcount);
6931 else
6932 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6933}
6934
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006935PyObject *
6936PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6937{
6938 PyObject* str_obj;
6939 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006940 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006941
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006942 str_obj = PyUnicode_FromObject(str_in);
6943 if (!str_obj)
6944 return NULL;
6945 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006946 if (!sep_obj) {
6947 Py_DECREF(str_obj);
6948 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006949 }
6950
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006951 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006952 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6953 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6954 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006955
Fredrik Lundhb9479482006-05-26 17:22:38 +00006956 Py_DECREF(sep_obj);
6957 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006958
6959 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006960}
6961
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006962
6963PyObject *
6964PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6965{
6966 PyObject* str_obj;
6967 PyObject* sep_obj;
6968 PyObject* out;
6969
6970 str_obj = PyUnicode_FromObject(str_in);
6971 if (!str_obj)
6972 return NULL;
6973 sep_obj = PyUnicode_FromObject(sep_in);
6974 if (!sep_obj) {
6975 Py_DECREF(str_obj);
6976 return NULL;
6977 }
6978
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006979 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006980 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6981 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6982 );
6983
6984 Py_DECREF(sep_obj);
6985 Py_DECREF(str_obj);
6986
6987 return out;
6988}
6989
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006990PyDoc_STRVAR(partition__doc__,
6991"S.partition(sep) -> (head, sep, tail)\n\
6992\n\
6993Searches for the separator sep in S, and returns the part before it,\n\
6994the separator itself, and the part after it. If the separator is not\n\
6995found, returns S and two empty strings.");
6996
6997static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006998unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006999{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007000 return PyUnicode_Partition((PyObject *)self, separator);
7001}
7002
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007003PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007004"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007005\n\
7006Searches for the separator sep in S, starting at the end of S, and returns\n\
7007the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007008separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007009
7010static PyObject*
7011unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7012{
7013 return PyUnicode_RPartition((PyObject *)self, separator);
7014}
7015
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007016PyObject *PyUnicode_RSplit(PyObject *s,
7017 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007018 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007019{
7020 PyObject *result;
7021
7022 s = PyUnicode_FromObject(s);
7023 if (s == NULL)
7024 return NULL;
7025 if (sep != NULL) {
7026 sep = PyUnicode_FromObject(sep);
7027 if (sep == NULL) {
7028 Py_DECREF(s);
7029 return NULL;
7030 }
7031 }
7032
7033 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7034
7035 Py_DECREF(s);
7036 Py_XDECREF(sep);
7037 return result;
7038}
7039
7040PyDoc_STRVAR(rsplit__doc__,
7041"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7042\n\
7043Return a list of the words in S, using sep as the\n\
7044delimiter string, starting at the end of the string and\n\
7045working to the front. If maxsplit is given, at most maxsplit\n\
7046splits are done. If sep is not specified, any whitespace string\n\
7047is a separator.");
7048
7049static PyObject*
7050unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7051{
7052 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007053 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007054
Martin v. Löwis18e16552006-02-15 17:27:45 +00007055 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007056 return NULL;
7057
7058 if (substring == Py_None)
7059 return rsplit(self, NULL, maxcount);
7060 else if (PyUnicode_Check(substring))
7061 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7062 else
7063 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7064}
7065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007067"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068\n\
7069Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007070Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007071is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072
7073static PyObject*
7074unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7075{
Guido van Rossum86662912000-04-11 15:38:46 +00007076 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077
Guido van Rossum86662912000-04-11 15:38:46 +00007078 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 return NULL;
7080
Guido van Rossum86662912000-04-11 15:38:46 +00007081 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082}
7083
7084static
7085PyObject *unicode_str(PyUnicodeObject *self)
7086{
Fred Drakee4315f52000-05-09 19:53:39 +00007087 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088}
7089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007090PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091"S.swapcase() -> unicode\n\
7092\n\
7093Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007094and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
7096static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007097unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 return fixup(self, fixswapcase);
7100}
7101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007102PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103"S.translate(table) -> unicode\n\
7104\n\
7105Return a copy of the string S, where all characters have been mapped\n\
7106through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007107Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7108Unmapped characters are left untouched. Characters mapped to None\n\
7109are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110
7111static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007112unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113{
Tim Petersced69f82003-09-16 20:30:58 +00007114 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007116 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 "ignore");
7118}
7119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007120PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121"S.upper() -> unicode\n\
7122\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007123Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124
7125static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007126unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 return fixup(self, fixupper);
7129}
7130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007131PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132"S.zfill(width) -> unicode\n\
7133\n\
7134Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007135of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136
7137static PyObject *
7138unicode_zfill(PyUnicodeObject *self, PyObject *args)
7139{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007140 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 PyUnicodeObject *u;
7142
Martin v. Löwis18e16552006-02-15 17:27:45 +00007143 Py_ssize_t width;
7144 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 return NULL;
7146
7147 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007148 if (PyUnicode_CheckExact(self)) {
7149 Py_INCREF(self);
7150 return (PyObject*) self;
7151 }
7152 else
7153 return PyUnicode_FromUnicode(
7154 PyUnicode_AS_UNICODE(self),
7155 PyUnicode_GET_SIZE(self)
7156 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 }
7158
7159 fill = width - self->length;
7160
7161 u = pad(self, fill, 0, '0');
7162
Walter Dörwald068325e2002-04-15 13:36:47 +00007163 if (u == NULL)
7164 return NULL;
7165
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 if (u->str[fill] == '+' || u->str[fill] == '-') {
7167 /* move sign to beginning of string */
7168 u->str[0] = u->str[fill];
7169 u->str[fill] = '0';
7170 }
7171
7172 return (PyObject*) u;
7173}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174
#if 0
/* Compiled out.  Returns unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
7182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007183PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007184"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007186Return True if S starts with the specified prefix, False otherwise.\n\
7187With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007188With optional end, stop comparing S at that position.\n\
7189prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
7191static PyObject *
7192unicode_startswith(PyUnicodeObject *self,
7193 PyObject *args)
7194{
Georg Brandl24250812006-06-09 18:45:48 +00007195 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007197 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007198 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007199 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200
Georg Brandl24250812006-06-09 18:45:48 +00007201 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007202 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007204 if (PyTuple_Check(subobj)) {
7205 Py_ssize_t i;
7206 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7207 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7208 PyTuple_GET_ITEM(subobj, i));
7209 if (substring == NULL)
7210 return NULL;
7211 result = tailmatch(self, substring, start, end, -1);
7212 Py_DECREF(substring);
7213 if (result) {
7214 Py_RETURN_TRUE;
7215 }
7216 }
7217 /* nothing matched */
7218 Py_RETURN_FALSE;
7219 }
7220 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007222 return NULL;
7223 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007225 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226}
7227
7228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007229PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007230"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007232Return True if S ends with the specified suffix, False otherwise.\n\
7233With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007234With optional end, stop comparing S at that position.\n\
7235suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
7237static PyObject *
7238unicode_endswith(PyUnicodeObject *self,
7239 PyObject *args)
7240{
Georg Brandl24250812006-06-09 18:45:48 +00007241 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007243 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007244 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007245 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246
Georg Brandl24250812006-06-09 18:45:48 +00007247 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7248 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007250 if (PyTuple_Check(subobj)) {
7251 Py_ssize_t i;
7252 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7253 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7254 PyTuple_GET_ITEM(subobj, i));
7255 if (substring == NULL)
7256 return NULL;
7257 result = tailmatch(self, substring, start, end, +1);
7258 Py_DECREF(substring);
7259 if (result) {
7260 Py_RETURN_TRUE;
7261 }
7262 }
7263 Py_RETURN_FALSE;
7264 }
7265 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268
Georg Brandl24250812006-06-09 18:45:48 +00007269 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007271 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272}
7273
7274
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007275
7276static PyObject *
7277unicode_getnewargs(PyUnicodeObject *v)
7278{
7279 return Py_BuildValue("(u#)", v->str, v->length);
7280}
7281
7282
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283static PyMethodDef unicode_methods[] = {
7284
7285 /* Order is according to common usage: often used methods should
7286 appear first, since lookup is done sequentially. */
7287
Georg Brandlecdc0a92006-03-30 12:19:07 +00007288 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007289 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7290 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007291 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007292 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7293 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7294 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7295 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7296 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7297 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7298 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007299 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007300 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7301 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7302 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007303 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007304 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007305/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7306 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7307 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7308 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007309 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007310 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007311 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007312 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007313 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7314 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7315 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7316 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7317 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7318 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7319 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7320 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7321 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7322 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7323 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7324 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7325 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7326 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007327 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007328#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007329 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330#endif
7331
7332#if 0
7333 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007334 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335#endif
7336
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007337 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338 {NULL, NULL}
7339};
7340
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007341static PyObject *
7342unicode_mod(PyObject *v, PyObject *w)
7343{
7344 if (!PyUnicode_Check(v)) {
7345 Py_INCREF(Py_NotImplemented);
7346 return Py_NotImplemented;
7347 }
7348 return PyUnicode_Format(v, w);
7349}
7350
7351static PyNumberMethods unicode_as_number = {
7352 0, /*nb_add*/
7353 0, /*nb_subtract*/
7354 0, /*nb_multiply*/
7355 0, /*nb_divide*/
7356 unicode_mod, /*nb_remainder*/
7357};
7358
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007360 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007361 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007362 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7363 (ssizeargfunc) unicode_getitem, /* sq_item */
7364 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 0, /* sq_ass_item */
7366 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007367 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368};
7369
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007370static PyObject*
7371unicode_subscript(PyUnicodeObject* self, PyObject* item)
7372{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007373 if (PyIndex_Check(item)) {
7374 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007375 if (i == -1 && PyErr_Occurred())
7376 return NULL;
7377 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007378 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007379 return unicode_getitem(self, i);
7380 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007381 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007382 Py_UNICODE* source_buf;
7383 Py_UNICODE* result_buf;
7384 PyObject* result;
7385
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007386 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007387 &start, &stop, &step, &slicelength) < 0) {
7388 return NULL;
7389 }
7390
7391 if (slicelength <= 0) {
7392 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007393 } else if (start == 0 && step == 1 && slicelength == self->length &&
7394 PyUnicode_CheckExact(self)) {
7395 Py_INCREF(self);
7396 return (PyObject *)self;
7397 } else if (step == 1) {
7398 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007399 } else {
7400 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007401 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7402 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007403
7404 if (result_buf == NULL)
7405 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007406
7407 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7408 result_buf[i] = source_buf[cur];
7409 }
Tim Petersced69f82003-09-16 20:30:58 +00007410
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007411 result = PyUnicode_FromUnicode(result_buf, slicelength);
7412 PyMem_FREE(result_buf);
7413 return result;
7414 }
7415 } else {
7416 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7417 return NULL;
7418 }
7419}
7420
7421static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007422 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007423 (binaryfunc)unicode_subscript, /* mp_subscript */
7424 (objobjargproc)0, /* mp_ass_subscript */
7425};
7426
Martin v. Löwis18e16552006-02-15 17:27:45 +00007427static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007429 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 const void **ptr)
7431{
7432 if (index != 0) {
7433 PyErr_SetString(PyExc_SystemError,
7434 "accessing non-existent unicode segment");
7435 return -1;
7436 }
7437 *ptr = (void *) self->str;
7438 return PyUnicode_GET_DATA_SIZE(self);
7439}
7440
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441static Py_ssize_t
7442unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 const void **ptr)
7444{
7445 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007446 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 return -1;
7448}
7449
7450static int
7451unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007452 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453{
7454 if (lenp)
7455 *lenp = PyUnicode_GET_DATA_SIZE(self);
7456 return 1;
7457}
7458
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007459static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007461 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 const void **ptr)
7463{
7464 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007465
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 if (index != 0) {
7467 PyErr_SetString(PyExc_SystemError,
7468 "accessing non-existent unicode segment");
7469 return -1;
7470 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007471 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 if (str == NULL)
7473 return -1;
7474 *ptr = (void *) PyString_AS_STRING(str);
7475 return PyString_GET_SIZE(str);
7476}
7477
7478/* Helpers for PyUnicode_Format() */
7479
7480static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007481getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007483 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 if (argidx < arglen) {
7485 (*p_argidx)++;
7486 if (arglen < 0)
7487 return args;
7488 else
7489 return PyTuple_GetItem(args, argidx);
7490 }
7491 PyErr_SetString(PyExc_TypeError,
7492 "not enough arguments for format string");
7493 return NULL;
7494}
7495
/* Bit flags carried in the `flags` argument of the format helpers.
   F_ALT selects the '#' (alternate form) conversion flag; the other names
   presumably mirror the '-', '+', ' ' and '0' conversion flags -- confirm
   against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7501
Martin v. Löwis18e16552006-02-15 17:27:45 +00007502static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007503strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007505 register Py_ssize_t i;
7506 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507 for (i = len - 1; i >= 0; i--)
7508 buffer[i] = (Py_UNICODE) charbuffer[i];
7509
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 return len;
7511}
7512
Neal Norwitzfc76d632006-01-10 06:03:13 +00007513static int
7514doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7515{
Tim Peters15231542006-02-16 01:08:01 +00007516 Py_ssize_t result;
7517
Neal Norwitzfc76d632006-01-10 06:03:13 +00007518 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007519 result = strtounicode(buffer, (char *)buffer);
7520 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007521}
7522
7523static int
7524longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7525{
Tim Peters15231542006-02-16 01:08:01 +00007526 Py_ssize_t result;
7527
Neal Norwitzfc76d632006-01-10 06:03:13 +00007528 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007529 result = strtounicode(buffer, (char *)buffer);
7530 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007531}
7532
Guido van Rossum078151d2002-08-11 04:24:12 +00007533/* XXX To save some code duplication, formatfloat/long/int could have been
7534 shared with stringobject.c, converting from 8-bit to Unicode after the
7535 formatting is done. */
7536
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537static int
7538formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007539 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 int flags,
7541 int prec,
7542 int type,
7543 PyObject *v)
7544{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007545 /* fmt = '%#.' + `prec` + `type`
7546 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 char fmt[20];
7548 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007549
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 x = PyFloat_AsDouble(v);
7551 if (x == -1.0 && PyErr_Occurred())
7552 return -1;
7553 if (prec < 0)
7554 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7556 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007557 /* Worst case length calc to ensure no buffer overrun:
7558
7559 'g' formats:
7560 fmt = %#.<prec>g
7561 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7562 for any double rep.)
7563 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7564
7565 'f' formats:
7566 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7567 len = 1 + 50 + 1 + prec = 52 + prec
7568
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007569 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007570 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007571
7572 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00007573 if (((type == 'g' || type == 'G') &&
7574 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007575 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007576 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007577 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007578 return -1;
7579 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007580 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7581 (flags&F_ALT) ? "#" : "",
7582 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007583 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584}
7585
Tim Peters38fd5b62000-09-21 05:43:11 +00007586static PyObject*
7587formatlong(PyObject *val, int flags, int prec, int type)
7588{
7589 char *buf;
7590 int i, len;
7591 PyObject *str; /* temporary string object. */
7592 PyUnicodeObject *result;
7593
7594 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7595 if (!str)
7596 return NULL;
7597 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007598 if (!result) {
7599 Py_DECREF(str);
7600 return NULL;
7601 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007602 for (i = 0; i < len; i++)
7603 result->str[i] = buf[i];
7604 result->str[len] = 0;
7605 Py_DECREF(str);
7606 return (PyObject*)result;
7607}
7608
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609static int
7610formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007611 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 int flags,
7613 int prec,
7614 int type,
7615 PyObject *v)
7616{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007617 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007618 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7619 * + 1 + 1
7620 * = 24
7621 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007622 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007623 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624 long x;
7625
7626 x = PyInt_AsLong(v);
7627 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007628 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007629 if (x < 0 && type == 'u') {
7630 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007631 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007632 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7633 sign = "-";
7634 else
7635 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007637 prec = 1;
7638
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007639 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7640 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007641 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007642 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007643 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007644 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007645 return -1;
7646 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007647
7648 if ((flags & F_ALT) &&
7649 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007650 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007651 * of issues that cause pain:
7652 * - when 0 is being converted, the C standard leaves off
7653 * the '0x' or '0X', which is inconsistent with other
7654 * %#x/%#X conversions and inconsistent with Python's
7655 * hex() function
7656 * - there are platforms that violate the standard and
7657 * convert 0 with the '0x' or '0X'
7658 * (Metrowerks, Compaq Tru64)
7659 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007660 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007661 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007662 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007663 * We can achieve the desired consistency by inserting our
7664 * own '0x' or '0X' prefix, and substituting %x/%X in place
7665 * of %#x/%#X.
7666 *
7667 * Note that this is the same approach as used in
7668 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007669 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007670 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7671 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007672 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007673 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007674 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7675 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007676 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007677 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007678 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007679 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007680 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007681 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682}
7683
7684static int
7685formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007686 size_t buflen,
7687 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007689 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007690 if (PyUnicode_Check(v)) {
7691 if (PyUnicode_GET_SIZE(v) != 1)
7692 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007696 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007697 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007698 goto onError;
7699 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701
7702 else {
7703 /* Integer input truncated to a character */
7704 long x;
7705 x = PyInt_AsLong(v);
7706 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007707 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007708#ifdef Py_UNICODE_WIDE
7709 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007710 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007711 "%c arg not in range(0x110000) "
7712 "(wide Python build)");
7713 return -1;
7714 }
7715#else
7716 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007717 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007718 "%c arg not in range(0x10000) "
7719 "(narrow Python build)");
7720 return -1;
7721 }
7722#endif
7723 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 }
7725 buf[1] = '\0';
7726 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007727
7728 onError:
7729 PyErr_SetString(PyExc_TypeError,
7730 "%c requires int or char");
7731 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732}
7733
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007734/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7735
7736 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7737 chars are formatted. XXX This is a magic number. Each formatting
7738 routine does bounds checking to ensure no overflow, but a better
7739 solution may be to malloc a buffer of appropriate size for each
7740 format. For now, the current solution is sufficient.
7741*/
7742#define FORMATBUFLEN (size_t)120
7743
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744PyObject *PyUnicode_Format(PyObject *format,
7745 PyObject *args)
7746{
7747 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007748 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 int args_owned = 0;
7750 PyUnicodeObject *result = NULL;
7751 PyObject *dict = NULL;
7752 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007753
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 if (format == NULL || args == NULL) {
7755 PyErr_BadInternalCall();
7756 return NULL;
7757 }
7758 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007759 if (uformat == NULL)
7760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 fmt = PyUnicode_AS_UNICODE(uformat);
7762 fmtcnt = PyUnicode_GET_SIZE(uformat);
7763
7764 reslen = rescnt = fmtcnt + 100;
7765 result = _PyUnicode_New(reslen);
7766 if (result == NULL)
7767 goto onError;
7768 res = PyUnicode_AS_UNICODE(result);
7769
7770 if (PyTuple_Check(args)) {
7771 arglen = PyTuple_Size(args);
7772 argidx = 0;
7773 }
7774 else {
7775 arglen = -1;
7776 argidx = -2;
7777 }
Martin v. Löwis68192102007-07-21 06:55:02 +00007778 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007779 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 dict = args;
7781
7782 while (--fmtcnt >= 0) {
7783 if (*fmt != '%') {
7784 if (--rescnt < 0) {
7785 rescnt = fmtcnt + 100;
7786 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007787 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007788 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7790 --rescnt;
7791 }
7792 *res++ = *fmt++;
7793 }
7794 else {
7795 /* Got a format specifier */
7796 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007797 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 Py_UNICODE c = '\0';
7800 Py_UNICODE fill;
7801 PyObject *v = NULL;
7802 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007803 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007805 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007806 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807
7808 fmt++;
7809 if (*fmt == '(') {
7810 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007811 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 PyObject *key;
7813 int pcount = 1;
7814
7815 if (dict == NULL) {
7816 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007817 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 goto onError;
7819 }
7820 ++fmt;
7821 --fmtcnt;
7822 keystart = fmt;
7823 /* Skip over balanced parentheses */
7824 while (pcount > 0 && --fmtcnt >= 0) {
7825 if (*fmt == ')')
7826 --pcount;
7827 else if (*fmt == '(')
7828 ++pcount;
7829 fmt++;
7830 }
7831 keylen = fmt - keystart - 1;
7832 if (fmtcnt < 0 || pcount > 0) {
7833 PyErr_SetString(PyExc_ValueError,
7834 "incomplete format key");
7835 goto onError;
7836 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007837#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007838 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 then looked up since Python uses strings to hold
7840 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007841 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 key = PyUnicode_EncodeUTF8(keystart,
7843 keylen,
7844 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007845#else
7846 key = PyUnicode_FromUnicode(keystart, keylen);
7847#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 if (key == NULL)
7849 goto onError;
7850 if (args_owned) {
7851 Py_DECREF(args);
7852 args_owned = 0;
7853 }
7854 args = PyObject_GetItem(dict, key);
7855 Py_DECREF(key);
7856 if (args == NULL) {
7857 goto onError;
7858 }
7859 args_owned = 1;
7860 arglen = -1;
7861 argidx = -2;
7862 }
7863 while (--fmtcnt >= 0) {
7864 switch (c = *fmt++) {
7865 case '-': flags |= F_LJUST; continue;
7866 case '+': flags |= F_SIGN; continue;
7867 case ' ': flags |= F_BLANK; continue;
7868 case '#': flags |= F_ALT; continue;
7869 case '0': flags |= F_ZERO; continue;
7870 }
7871 break;
7872 }
7873 if (c == '*') {
7874 v = getnextarg(args, arglen, &argidx);
7875 if (v == NULL)
7876 goto onError;
7877 if (!PyInt_Check(v)) {
7878 PyErr_SetString(PyExc_TypeError,
7879 "* wants int");
7880 goto onError;
7881 }
7882 width = PyInt_AsLong(v);
7883 if (width < 0) {
7884 flags |= F_LJUST;
7885 width = -width;
7886 }
7887 if (--fmtcnt >= 0)
7888 c = *fmt++;
7889 }
7890 else if (c >= '0' && c <= '9') {
7891 width = c - '0';
7892 while (--fmtcnt >= 0) {
7893 c = *fmt++;
7894 if (c < '0' || c > '9')
7895 break;
7896 if ((width*10) / 10 != width) {
7897 PyErr_SetString(PyExc_ValueError,
7898 "width too big");
7899 goto onError;
7900 }
7901 width = width*10 + (c - '0');
7902 }
7903 }
7904 if (c == '.') {
7905 prec = 0;
7906 if (--fmtcnt >= 0)
7907 c = *fmt++;
7908 if (c == '*') {
7909 v = getnextarg(args, arglen, &argidx);
7910 if (v == NULL)
7911 goto onError;
7912 if (!PyInt_Check(v)) {
7913 PyErr_SetString(PyExc_TypeError,
7914 "* wants int");
7915 goto onError;
7916 }
7917 prec = PyInt_AsLong(v);
7918 if (prec < 0)
7919 prec = 0;
7920 if (--fmtcnt >= 0)
7921 c = *fmt++;
7922 }
7923 else if (c >= '0' && c <= '9') {
7924 prec = c - '0';
7925 while (--fmtcnt >= 0) {
7926 c = Py_CHARMASK(*fmt++);
7927 if (c < '0' || c > '9')
7928 break;
7929 if ((prec*10) / 10 != prec) {
7930 PyErr_SetString(PyExc_ValueError,
7931 "prec too big");
7932 goto onError;
7933 }
7934 prec = prec*10 + (c - '0');
7935 }
7936 }
7937 } /* prec */
7938 if (fmtcnt >= 0) {
7939 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 if (--fmtcnt >= 0)
7941 c = *fmt++;
7942 }
7943 }
7944 if (fmtcnt < 0) {
7945 PyErr_SetString(PyExc_ValueError,
7946 "incomplete format");
7947 goto onError;
7948 }
7949 if (c != '%') {
7950 v = getnextarg(args, arglen, &argidx);
7951 if (v == NULL)
7952 goto onError;
7953 }
7954 sign = 0;
7955 fill = ' ';
7956 switch (c) {
7957
7958 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007959 pbuf = formatbuf;
7960 /* presume that buffer length is at least 1 */
7961 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 len = 1;
7963 break;
7964
7965 case 's':
7966 case 'r':
7967 if (PyUnicode_Check(v) && c == 's') {
7968 temp = v;
7969 Py_INCREF(temp);
7970 }
7971 else {
7972 PyObject *unicode;
7973 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007974 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 else
7976 temp = PyObject_Repr(v);
7977 if (temp == NULL)
7978 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007979 if (PyUnicode_Check(temp))
7980 /* nothing to do */;
7981 else if (PyString_Check(temp)) {
7982 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007983 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007985 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007987 Py_DECREF(temp);
7988 temp = unicode;
7989 if (temp == NULL)
7990 goto onError;
7991 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007992 else {
7993 Py_DECREF(temp);
7994 PyErr_SetString(PyExc_TypeError,
7995 "%s argument has non-string str()");
7996 goto onError;
7997 }
7998 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007999 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 len = PyUnicode_GET_SIZE(temp);
8001 if (prec >= 0 && len > prec)
8002 len = prec;
8003 break;
8004
8005 case 'i':
8006 case 'd':
8007 case 'u':
8008 case 'o':
8009 case 'x':
8010 case 'X':
8011 if (c == 'i')
8012 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008013 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008014 temp = formatlong(v, flags, prec, c);
8015 if (!temp)
8016 goto onError;
8017 pbuf = PyUnicode_AS_UNICODE(temp);
8018 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008019 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008021 else {
8022 pbuf = formatbuf;
8023 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8024 flags, prec, c, v);
8025 if (len < 0)
8026 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008027 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008028 }
8029 if (flags & F_ZERO)
8030 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 break;
8032
8033 case 'e':
8034 case 'E':
8035 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008036 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 case 'g':
8038 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008039 if (c == 'F')
8040 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008041 pbuf = formatbuf;
8042 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8043 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 if (len < 0)
8045 goto onError;
8046 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008047 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 fill = '0';
8049 break;
8050
8051 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008052 pbuf = formatbuf;
8053 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 if (len < 0)
8055 goto onError;
8056 break;
8057
8058 default:
8059 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008060 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008061 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008062 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008063 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008064 (Py_ssize_t)(fmt - 1 -
8065 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 goto onError;
8067 }
8068 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008069 if (*pbuf == '-' || *pbuf == '+') {
8070 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 len--;
8072 }
8073 else if (flags & F_SIGN)
8074 sign = '+';
8075 else if (flags & F_BLANK)
8076 sign = ' ';
8077 else
8078 sign = 0;
8079 }
8080 if (width < len)
8081 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008082 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 reslen -= rescnt;
8084 rescnt = width + fmtcnt + 100;
8085 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008086 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008087 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008088 PyErr_NoMemory();
8089 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008090 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008091 if (_PyUnicode_Resize(&result, reslen) < 0) {
8092 Py_XDECREF(temp);
8093 goto onError;
8094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 res = PyUnicode_AS_UNICODE(result)
8096 + reslen - rescnt;
8097 }
8098 if (sign) {
8099 if (fill != ' ')
8100 *res++ = sign;
8101 rescnt--;
8102 if (width > len)
8103 width--;
8104 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008105 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8106 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008107 assert(pbuf[1] == c);
8108 if (fill != ' ') {
8109 *res++ = *pbuf++;
8110 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008111 }
Tim Petersfff53252001-04-12 18:38:48 +00008112 rescnt -= 2;
8113 width -= 2;
8114 if (width < 0)
8115 width = 0;
8116 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118 if (width > len && !(flags & F_LJUST)) {
8119 do {
8120 --rescnt;
8121 *res++ = fill;
8122 } while (--width > len);
8123 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008124 if (fill == ' ') {
8125 if (sign)
8126 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008127 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008128 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008129 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008130 *res++ = *pbuf++;
8131 *res++ = *pbuf++;
8132 }
8133 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008134 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 res += len;
8136 rescnt -= len;
8137 while (--width >= len) {
8138 --rescnt;
8139 *res++ = ' ';
8140 }
8141 if (dict && (argidx < arglen) && c != '%') {
8142 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008143 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008144 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 goto onError;
8146 }
8147 Py_XDECREF(temp);
8148 } /* '%' */
8149 } /* until end */
8150 if (argidx < arglen && !dict) {
8151 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008152 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 goto onError;
8154 }
8155
Thomas Woutersa96affe2006-03-12 00:29:36 +00008156 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8157 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 if (args_owned) {
8159 Py_DECREF(args);
8160 }
8161 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 return (PyObject *)result;
8163
8164 onError:
8165 Py_XDECREF(result);
8166 Py_DECREF(uformat);
8167 if (args_owned) {
8168 Py_DECREF(args);
8169 }
8170 return NULL;
8171}
8172
8173static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008174 (readbufferproc) unicode_buffer_getreadbuf,
8175 (writebufferproc) unicode_buffer_getwritebuf,
8176 (segcountproc) unicode_buffer_getsegcount,
8177 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178};
8179
Jeremy Hylton938ace62002-07-17 16:30:39 +00008180static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008181unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8182
Tim Peters6d6c1a32001-08-02 04:15:00 +00008183static PyObject *
8184unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8185{
8186 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008187 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008188 char *encoding = NULL;
8189 char *errors = NULL;
8190
Guido van Rossume023fe02001-08-30 03:12:59 +00008191 if (type != &PyUnicode_Type)
8192 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008193 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8194 kwlist, &x, &encoding, &errors))
8195 return NULL;
8196 if (x == NULL)
8197 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008198 if (encoding == NULL && errors == NULL)
8199 return PyObject_Unicode(x);
8200 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008201 return PyUnicode_FromEncodedObject(x, encoding, errors);
8202}
8203
Guido van Rossume023fe02001-08-30 03:12:59 +00008204static PyObject *
8205unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8206{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008207 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008208 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008209
8210 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8211 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8212 if (tmp == NULL)
8213 return NULL;
8214 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008215 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008216 if (pnew == NULL) {
8217 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008218 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008219 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008220 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8221 if (pnew->str == NULL) {
8222 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008223 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008224 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008225 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008226 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008227 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8228 pnew->length = n;
8229 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008230 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008231 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008232}
8233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008234PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008235"unicode(string [, encoding[, errors]]) -> object\n\
8236\n\
8237Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008238encoding defaults to the current default string encoding.\n\
8239errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008240
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008242 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 "unicode", /* tp_name */
8244 sizeof(PyUnicodeObject), /* tp_size */
8245 0, /* tp_itemsize */
8246 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008247 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008249 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008251 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008252 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008253 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008255 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 (hashfunc) unicode_hash, /* tp_hash*/
8257 0, /* tp_call*/
8258 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008259 PyObject_GenericGetAttr, /* tp_getattro */
8260 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008262 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008263 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008264 unicode_doc, /* tp_doc */
8265 0, /* tp_traverse */
8266 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008267 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008268 0, /* tp_weaklistoffset */
8269 0, /* tp_iter */
8270 0, /* tp_iternext */
8271 unicode_methods, /* tp_methods */
8272 0, /* tp_members */
8273 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008274 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008275 0, /* tp_dict */
8276 0, /* tp_descr_get */
8277 0, /* tp_descr_set */
8278 0, /* tp_dictoffset */
8279 0, /* tp_init */
8280 0, /* tp_alloc */
8281 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008282 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283};
8284
8285/* Initialize the Unicode implementation */
8286
Thomas Wouters78890102000-07-22 19:25:51 +00008287void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008289 int i;
8290
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008291 /* XXX - move this array to unicodectype.c ? */
8292 Py_UNICODE linebreak[] = {
8293 0x000A, /* LINE FEED */
8294 0x000D, /* CARRIAGE RETURN */
8295 0x001C, /* FILE SEPARATOR */
8296 0x001D, /* GROUP SEPARATOR */
8297 0x001E, /* RECORD SEPARATOR */
8298 0x0085, /* NEXT LINE */
8299 0x2028, /* LINE SEPARATOR */
8300 0x2029, /* PARAGRAPH SEPARATOR */
8301 };
8302
Fred Drakee4315f52000-05-09 19:53:39 +00008303 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008304 unicode_freelist = NULL;
8305 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008307 if (!unicode_empty)
8308 return;
8309
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008310 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008311 for (i = 0; i < 256; i++)
8312 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008313 if (PyType_Ready(&PyUnicode_Type) < 0)
8314 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008315
8316 /* initialize the linebreak bloom filter */
8317 bloom_linebreak = make_bloom_mask(
8318 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8319 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008320
8321 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322}
8323
8324/* Finalize the Unicode implementation */
8325
8326void
Thomas Wouters78890102000-07-22 19:25:51 +00008327_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008329 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008330 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008332 Py_XDECREF(unicode_empty);
8333 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008334
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008335 for (i = 0; i < 256; i++) {
8336 if (unicode_latin1[i]) {
8337 Py_DECREF(unicode_latin1[i]);
8338 unicode_latin1[i] = NULL;
8339 }
8340 }
8341
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008342 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 PyUnicodeObject *v = u;
8344 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008345 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008346 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008347 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008348 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008350 unicode_freelist = NULL;
8351 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008353
Anthony Baxterac6bd462006-04-13 02:06:09 +00008354#ifdef __cplusplus
8355}
8356#endif
8357
8358
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008359/*
8360Local variables:
8361c-basic-offset: 4
8362indent-tabs-mode: nil
8363End:
8364*/