blob: 18b861bb0a6bc99fe513d6a1011cb0036d8a358c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
134
/* Integer type holding a bloom-style bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask covering the linebreak characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL rather than 1: shifting a signed int into
   bit 31 is undefined, and on platforms with a 64-bit long the
   sign-extended result would also match the upper 32 bits of the mask. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap mask pre-check followed by the exact linebreak test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* True if chr is one of the setlen characters in set.  mask is tested
   first as a cheap pre-filter (presumably the make_bloom_mask() of set --
   confirm at the call sites).  The expansion is fully parenthesized so it
   stays correct inside larger expressions such as !BLOOM_MEMBER(...). */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Martin v. Löwis68192102007-07-21 06:55:02 +0000315 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis68192102007-07-21 06:55:02 +0000329 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Internal API for use in unicodeobject.c only !
   Casts unicodevar to PyObject ** and forwards to PyUnicode_Resize().
   Both arguments are parenthesized so that expression arguments expand
   safely. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +0000557 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000607 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000717 Py_Type(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
799 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000800 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000802 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000803
804 PyObject *restuple = NULL;
805 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
807 Py_ssize_t requiredsize;
808 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811 int res = -1;
812
813 if (*errorHandler == NULL) {
814 *errorHandler = PyCodec_LookupError(errors);
815 if (*errorHandler == NULL)
816 goto onError;
817 }
818
819 if (*exceptionObject == NULL) {
820 *exceptionObject = PyUnicodeDecodeError_Create(
821 encoding, input, insize, *startinpos, *endinpos, reason);
822 if (*exceptionObject == NULL)
823 goto onError;
824 }
825 else {
826 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
827 goto onError;
828 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
829 goto onError;
830 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
831 goto onError;
832 }
833
834 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
835 if (restuple == NULL)
836 goto onError;
837 if (!PyTuple_Check(restuple)) {
838 PyErr_Format(PyExc_TypeError, &argparse[4]);
839 goto onError;
840 }
841 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
842 goto onError;
843 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000844 newpos = insize+newpos;
845 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000846 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000847 goto onError;
848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849
850 /* need more space? (at least enough for what we
851 have+the replacement+the rest of the string (starting
852 at the new input position), so we won't have to check space
853 when there are no errors in the rest of the string) */
854 repptr = PyUnicode_AS_UNICODE(repunicode);
855 repsize = PyUnicode_GET_SIZE(repunicode);
856 requiredsize = *outpos + repsize + insize-newpos;
857 if (requiredsize > outsize) {
858 if (requiredsize<2*outsize)
859 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000860 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861 goto onError;
862 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
863 }
864 *endinpos = newpos;
865 *inptr = input + newpos;
866 Py_UNICODE_COPY(*outptr, repptr, repsize);
867 *outptr += repsize;
868 *outpos += repsize;
869 /* we made it! */
870 res = 0;
871
872 onError:
873 Py_XDECREF(restuple);
874 return res;
875}
876
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000877/* --- UTF-7 Codec -------------------------------------------------------- */
878
879/* see RFC2152 for details */
880
/* Per-character UTF-7 classification for code points 0..127: indicates
   whether a character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f:  sp ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f:  0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f:  @ A B C D E F G H I J K L M N O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f:  P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f:  ` a b c d e f g h i j k l m n o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f:  p q r s t u v w x y z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
899
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: it lies outside 1..127, is marked
   special in utf7_special, or falls in an optional class (whitespace,
   Set O) that the caller asked to have encoded. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  Explicit ASCII
   range tests are used instead of isalnum(): isalnum() depends on the
   current locale and is undefined for arguments outside the range of
   unsigned char, while this macro is applied to Py_UNICODE values.
   (UB64 below already assumes ASCII character codes.) */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Six-bit value of the base64 alphabet character c; only meaningful
   when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for as long as at least six bits are
   pending in ch; bits is decremented accordingly. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units to out for as long as at least sixteen bits are
   pending in ch.  Expects an `errmsg` variable and a `utf7Error` label
   in the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000943PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000944 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 const char *errors)
946{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +0000947 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
948}
949
950PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
951 Py_ssize_t size,
952 const char *errors,
953 Py_ssize_t *consumed)
954{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000955 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000956 Py_ssize_t startinpos;
957 Py_ssize_t endinpos;
958 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 const char *e;
960 PyUnicodeObject *unicode;
961 Py_UNICODE *p;
962 const char *errmsg = "";
963 int inShift = 0;
964 unsigned int bitsleft = 0;
965 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966 int surrogate = 0;
967 PyObject *errorHandler = NULL;
968 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969
970 unicode = _PyUnicode_New(size);
971 if (!unicode)
972 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +0000973 if (size == 0) {
974 if (consumed)
975 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000976 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +0000977 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000978
979 p = unicode->str;
980 e = s + size;
981
982 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000983 Py_UNICODE ch;
984 restart:
985 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000986
987 if (inShift) {
988 if ((ch == '-') || !B64CHAR(ch)) {
989 inShift = 0;
990 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000991
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000992 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
993 if (bitsleft >= 6) {
994 /* The shift sequence has a partial character in it. If
995 bitsleft < 6 then we could just classify it as padding
996 but that is not the case here */
997
998 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000999 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001000 }
1001 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001002 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003 here so indicate the potential of a misencoded character. */
1004
1005 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1006 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1007 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001008 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001009 }
1010
1011 if (ch == '-') {
1012 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001013 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001014 inShift = 1;
1015 }
1016 } else if (SPECIAL(ch,0,0)) {
1017 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001018 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 } else {
1020 *p++ = ch;
1021 }
1022 } else {
1023 charsleft = (charsleft << 6) | UB64(ch);
1024 bitsleft += 6;
1025 s++;
1026 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1027 }
1028 }
1029 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001030 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001031 s++;
1032 if (s < e && *s == '-') {
1033 s++;
1034 *p++ = '+';
1035 } else
1036 {
1037 inShift = 1;
1038 bitsleft = 0;
1039 }
1040 }
1041 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001042 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001043 errmsg = "unexpected special character";
1044 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001045 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001046 }
1047 else {
1048 *p++ = ch;
1049 s++;
1050 }
1051 continue;
1052 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001053 outpos = p-PyUnicode_AS_UNICODE(unicode);
1054 endinpos = s-starts;
1055 if (unicode_decode_call_errorhandler(
1056 errors, &errorHandler,
1057 "utf7", errmsg,
1058 starts, size, &startinpos, &endinpos, &exc, &s,
1059 (PyObject **)&unicode, &outpos, &p))
1060 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001063 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 outpos = p-PyUnicode_AS_UNICODE(unicode);
1065 endinpos = size;
1066 if (unicode_decode_call_errorhandler(
1067 errors, &errorHandler,
1068 "utf7", "unterminated shift sequence",
1069 starts, size, &startinpos, &endinpos, &exc, &s,
1070 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001072 if (s < e)
1073 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001074 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001075 if (consumed) {
1076 if(inShift)
1077 *consumed = startinpos;
1078 else
1079 *consumed = s-starts;
1080 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001081
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001082 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001083 goto onError;
1084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001085 Py_XDECREF(errorHandler);
1086 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 return (PyObject *)unicode;
1088
1089onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001090 Py_XDECREF(errorHandler);
1091 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001092 Py_DECREF(unicode);
1093 return NULL;
1094}
1095
1096
1097PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001098 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001099 int encodeSetO,
1100 int encodeWhiteSpace,
1101 const char *errors)
1102{
1103 PyObject *v;
1104 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001105 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001106 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001107 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 unsigned int bitsleft = 0;
1109 unsigned long charsleft = 0;
1110 char * out;
1111 char * start;
1112
1113 if (size == 0)
1114 return PyString_FromStringAndSize(NULL, 0);
1115
1116 v = PyString_FromStringAndSize(NULL, cbAllocated);
1117 if (v == NULL)
1118 return NULL;
1119
1120 start = out = PyString_AS_STRING(v);
1121 for (;i < size; ++i) {
1122 Py_UNICODE ch = s[i];
1123
1124 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001125 if (ch == '+') {
1126 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001127 *out++ = '-';
1128 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1129 charsleft = ch;
1130 bitsleft = 16;
1131 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001132 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001133 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001134 } else {
1135 *out++ = (char) ch;
1136 }
1137 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001138 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1139 *out++ = B64(charsleft << (6-bitsleft));
1140 charsleft = 0;
1141 bitsleft = 0;
1142 /* Characters not in the BASE64 set implicitly unshift the sequence
1143 so no '-' is required, except if the character is itself a '-' */
1144 if (B64CHAR(ch) || ch == '-') {
1145 *out++ = '-';
1146 }
1147 inShift = 0;
1148 *out++ = (char) ch;
1149 } else {
1150 bitsleft += 16;
1151 charsleft = (charsleft << 16) | ch;
1152 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1153
1154 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001155 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001156 or '-' then the shift sequence will be terminated implicitly and we
1157 don't have to insert a '-'. */
1158
1159 if (bitsleft == 0) {
1160 if (i + 1 < size) {
1161 Py_UNICODE ch2 = s[i+1];
1162
1163 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001164
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001165 } else if (B64CHAR(ch2) || ch2 == '-') {
1166 *out++ = '-';
1167 inShift = 0;
1168 } else {
1169 inShift = 0;
1170 }
1171
1172 }
1173 else {
1174 *out++ = '-';
1175 inShift = 0;
1176 }
1177 }
Tim Petersced69f82003-09-16 20:30:58 +00001178 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001179 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001180 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001181 if (bitsleft) {
1182 *out++= B64(charsleft << (6-bitsleft) );
1183 *out++ = '-';
1184 }
1185
Tim Peters5de98422002-04-27 18:44:32 +00001186 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001187 return v;
1188}
1189
1190#undef SPECIAL
1191#undef B64
1192#undef B64CHAR
1193#undef UB64
1194#undef ENCODE
1195#undef DECODE
1196
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197/* --- UTF-8 Codec -------------------------------------------------------- */
1198
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it introduces.  Zero means illegal prefix.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1220
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001222 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 const char *errors)
1224{
Walter Dörwald69652032004-09-07 20:24:22 +00001225 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1226}
1227
1228PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001229 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001230 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001231 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001232{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001233 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001235 Py_ssize_t startinpos;
1236 Py_ssize_t endinpos;
1237 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 const char *e;
1239 PyUnicodeObject *unicode;
1240 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001241 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001242 PyObject *errorHandler = NULL;
1243 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244
1245 /* Note: size will always be longer than the resulting Unicode
1246 character count */
1247 unicode = _PyUnicode_New(size);
1248 if (!unicode)
1249 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001250 if (size == 0) {
1251 if (consumed)
1252 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255
1256 /* Unpack UTF-8 encoded data */
1257 p = unicode->str;
1258 e = s + size;
1259
1260 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001261 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001264 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 s++;
1266 continue;
1267 }
1268
1269 n = utf8_code_length[ch];
1270
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001272 if (consumed)
1273 break;
1274 else {
1275 errmsg = "unexpected end of data";
1276 startinpos = s-starts;
1277 endinpos = size;
1278 goto utf8Error;
1279 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281
1282 switch (n) {
1283
1284 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001291 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001292 startinpos = s-starts;
1293 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001294 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295
1296 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 if ((s[1] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001304 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001305 startinpos = s-starts;
1306 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001307 errmsg = "illegal encoding";
1308 goto utf8Error;
1309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001311 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 break;
1313
1314 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001315 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 (s[2] & 0xc0) != 0x80) {
1317 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 startinpos = s-starts;
1319 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001320 goto utf8Error;
1321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001323 if (ch < 0x0800) {
1324 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001325 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001326
1327 XXX For wide builds (UCS-4) we should probably try
1328 to recombine the surrogates into a single code
1329 unit.
1330 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001331 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 startinpos = s-starts;
1333 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001334 goto utf8Error;
1335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001337 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001338 break;
1339
1340 case 4:
1341 if ((s[1] & 0xc0) != 0x80 ||
1342 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 (s[3] & 0xc0) != 0x80) {
1344 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001345 startinpos = s-starts;
1346 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001347 goto utf8Error;
1348 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1350 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1351 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001352 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001353 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001354 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001355 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001357 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001358 startinpos = s-starts;
1359 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001360 goto utf8Error;
1361 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001362#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001363 *p++ = (Py_UNICODE)ch;
1364#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001365 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001366
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001367 /* translate from 10000..10FFFF to 0..FFFF */
1368 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001369
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001370 /* high surrogate = top 10 bits added to D800 */
1371 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001372
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001373 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001374 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001375#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 break;
1377
1378 default:
1379 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001380 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001381 startinpos = s-starts;
1382 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001383 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 }
1385 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001386 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001387
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001388 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001389 outpos = p-PyUnicode_AS_UNICODE(unicode);
1390 if (unicode_decode_call_errorhandler(
1391 errors, &errorHandler,
1392 "utf8", errmsg,
1393 starts, size, &startinpos, &endinpos, &exc, &s,
1394 (PyObject **)&unicode, &outpos, &p))
1395 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 }
Walter Dörwald69652032004-09-07 20:24:22 +00001397 if (consumed)
1398 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399
1400 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001401 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001402 goto onError;
1403
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001404 Py_XDECREF(errorHandler);
1405 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 return (PyObject *)unicode;
1407
1408onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409 Py_XDECREF(errorHandler);
1410 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411 Py_DECREF(unicode);
1412 return NULL;
1413}
1414
Tim Peters602f7402002-04-27 18:03:26 +00001415/* Allocation strategy: if the string is short, convert into a stack buffer
1416 and allocate exactly as much space needed at the end. Else allocate the
1417 maximum possible needed (4 result bytes per Unicode character), and return
1418 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001419*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001420PyObject *
1421PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001422 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001423 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424{
Tim Peters602f7402002-04-27 18:03:26 +00001425#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001426
Martin v. Löwis18e16552006-02-15 17:27:45 +00001427 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001428 PyObject *v; /* result string object */
1429 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001430 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001431 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001432 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001433
Tim Peters602f7402002-04-27 18:03:26 +00001434 assert(s != NULL);
1435 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436
Tim Peters602f7402002-04-27 18:03:26 +00001437 if (size <= MAX_SHORT_UNICHARS) {
1438 /* Write into the stack buffer; nallocated can't overflow.
1439 * At the end, we'll allocate exactly as much heap space as it
1440 * turns out we need.
1441 */
1442 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1443 v = NULL; /* will allocate after we're done */
1444 p = stackbuf;
1445 }
1446 else {
1447 /* Overallocate on the heap, and give the excess back at the end. */
1448 nallocated = size * 4;
1449 if (nallocated / 4 != size) /* overflow! */
1450 return PyErr_NoMemory();
1451 v = PyString_FromStringAndSize(NULL, nallocated);
1452 if (v == NULL)
1453 return NULL;
1454 p = PyString_AS_STRING(v);
1455 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001456
Tim Peters602f7402002-04-27 18:03:26 +00001457 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001458 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001459
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001460 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001461 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001465 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001466 *p++ = (char)(0xc0 | (ch >> 6));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001468 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001469 else {
Tim Peters602f7402002-04-27 18:03:26 +00001470 /* Encode UCS2 Unicode ordinals */
1471 if (ch < 0x10000) {
1472 /* Special case: check for high surrogate */
1473 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1474 Py_UCS4 ch2 = s[i];
1475 /* Check for low surrogate and combine the two to
1476 form a UCS4 value */
1477 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001478 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001479 i++;
1480 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001481 }
Tim Peters602f7402002-04-27 18:03:26 +00001482 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001483 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001484 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001485 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1486 *p++ = (char)(0x80 | (ch & 0x3f));
1487 continue;
1488 }
1489encodeUCS4:
1490 /* Encode UCS4 Unicode ordinals */
1491 *p++ = (char)(0xf0 | (ch >> 18));
1492 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1493 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1494 *p++ = (char)(0x80 | (ch & 0x3f));
1495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001497
Tim Peters602f7402002-04-27 18:03:26 +00001498 if (v == NULL) {
1499 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001500 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001501 assert(nneeded <= nallocated);
1502 v = PyString_FromStringAndSize(stackbuf, nneeded);
1503 }
1504 else {
1505 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001506 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001507 assert(nneeded <= nallocated);
1508 _PyString_Resize(&v, nneeded);
1509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001511
Tim Peters602f7402002-04-27 18:03:26 +00001512#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513}
1514
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1516{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 if (!PyUnicode_Check(unicode)) {
1518 PyErr_BadArgument();
1519 return NULL;
1520 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001521 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1522 PyUnicode_GET_SIZE(unicode),
1523 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001524}
1525
Walter Dörwald6e390802007-08-17 16:41:28 +00001526/* --- UTF-32 Codec ------------------------------------------------------- */
1527
1528PyObject *
1529PyUnicode_DecodeUTF32(const char *s,
1530 Py_ssize_t size,
1531 const char *errors,
1532 int *byteorder)
1533{
1534 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1535}
1536
1537PyObject *
1538PyUnicode_DecodeUTF32Stateful(const char *s,
1539 Py_ssize_t size,
1540 const char *errors,
1541 int *byteorder,
1542 Py_ssize_t *consumed)
1543{
1544 const char *starts = s;
1545 Py_ssize_t startinpos;
1546 Py_ssize_t endinpos;
1547 Py_ssize_t outpos;
1548 PyUnicodeObject *unicode;
1549 Py_UNICODE *p;
1550#ifndef Py_UNICODE_WIDE
1551 int i, pairs;
1552#else
1553 const int pairs = 0;
1554#endif
1555 const unsigned char *q, *e;
1556 int bo = 0; /* assume native ordering by default */
1557 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00001558 /* Offsets from q for retrieving bytes in the right order. */
1559#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1560 int iorder[] = {0, 1, 2, 3};
1561#else
1562 int iorder[] = {3, 2, 1, 0};
1563#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00001564 PyObject *errorHandler = NULL;
1565 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00001566 /* On narrow builds we split characters outside the BMP into two
1567 codepoints => count how much extra space we need. */
1568#ifndef Py_UNICODE_WIDE
1569 for (i = pairs = 0; i < size/4; i++)
1570 if (((Py_UCS4 *)s)[i] >= 0x10000)
1571 pairs++;
1572#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00001573
1574 /* This might be one to much, because of a BOM */
1575 unicode = _PyUnicode_New((size+3)/4+pairs);
1576 if (!unicode)
1577 return NULL;
1578 if (size == 0)
1579 return (PyObject *)unicode;
1580
1581 /* Unpack UTF-32 encoded data */
1582 p = unicode->str;
1583 q = (unsigned char *)s;
1584 e = q + size;
1585
1586 if (byteorder)
1587 bo = *byteorder;
1588
1589 /* Check for BOM marks (U+FEFF) in the input and adjust current
1590 byte order setting accordingly. In native mode, the leading BOM
1591 mark is skipped, in all other modes, it is copied to the output
1592 stream as-is (giving a ZWNBSP character). */
1593 if (bo == 0) {
1594 if (size >= 4) {
1595 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1596 (q[iorder[1]] << 8) | q[iorder[0]];
1597#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1598 if (bom == 0x0000FEFF) {
1599 q += 4;
1600 bo = -1;
1601 }
1602 else if (bom == 0xFFFE0000) {
1603 q += 4;
1604 bo = 1;
1605 }
1606#else
1607 if (bom == 0x0000FEFF) {
1608 q += 4;
1609 bo = 1;
1610 }
1611 else if (bom == 0xFFFE0000) {
1612 q += 4;
1613 bo = -1;
1614 }
1615#endif
1616 }
1617 }
1618
1619 if (bo == -1) {
1620 /* force LE */
1621 iorder[0] = 0;
1622 iorder[1] = 1;
1623 iorder[2] = 2;
1624 iorder[3] = 3;
1625 }
1626 else if (bo == 1) {
1627 /* force BE */
1628 iorder[0] = 3;
1629 iorder[1] = 2;
1630 iorder[2] = 1;
1631 iorder[3] = 0;
1632 }
1633
1634 while (q < e) {
1635 Py_UCS4 ch;
1636 /* remaining bytes at the end? (size should be divisible by 4) */
1637 if (e-q<4) {
1638 if (consumed)
1639 break;
1640 errmsg = "truncated data";
1641 startinpos = ((const char *)q)-starts;
1642 endinpos = ((const char *)e)-starts;
1643 goto utf32Error;
1644 /* The remaining input chars are ignored if the callback
1645 chooses to skip the input */
1646 }
1647 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1648 (q[iorder[1]] << 8) | q[iorder[0]];
1649
1650 if (ch >= 0x110000)
1651 {
1652 errmsg = "codepoint not in range(0x110000)";
1653 startinpos = ((const char *)q)-starts;
1654 endinpos = startinpos+4;
1655 goto utf32Error;
1656 }
1657#ifndef Py_UNICODE_WIDE
1658 if (ch >= 0x10000)
1659 {
1660 *p++ = 0xD800 | ((ch-0x10000) >> 10);
1661 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
1662 }
1663 else
1664#endif
1665 *p++ = ch;
1666 q += 4;
1667 continue;
1668 utf32Error:
1669 outpos = p-PyUnicode_AS_UNICODE(unicode);
1670 if (unicode_decode_call_errorhandler(
1671 errors, &errorHandler,
1672 "utf32", errmsg,
1673 starts, size, &startinpos, &endinpos, &exc, &s,
1674 (PyObject **)&unicode, &outpos, &p))
1675 goto onError;
1676 }
1677
1678 if (byteorder)
1679 *byteorder = bo;
1680
1681 if (consumed)
1682 *consumed = (const char *)q-starts;
1683
1684 /* Adjust length */
1685 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1686 goto onError;
1687
1688 Py_XDECREF(errorHandler);
1689 Py_XDECREF(exc);
1690 return (PyObject *)unicode;
1691
1692onError:
1693 Py_DECREF(unicode);
1694 Py_XDECREF(errorHandler);
1695 Py_XDECREF(exc);
1696 return NULL;
1697}
1698
1699PyObject *
1700PyUnicode_EncodeUTF32(const Py_UNICODE *s,
1701 Py_ssize_t size,
1702 const char *errors,
1703 int byteorder)
1704{
1705 PyObject *v;
1706 unsigned char *p;
1707#ifndef Py_UNICODE_WIDE
1708 int i, pairs;
1709#else
1710 const int pairs = 0;
1711#endif
1712 /* Offsets from p for storing byte pairs in the right order. */
1713#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1714 int iorder[] = {0, 1, 2, 3};
1715#else
1716 int iorder[] = {3, 2, 1, 0};
1717#endif
1718
1719#define STORECHAR(CH) \
1720 do { \
1721 p[iorder[3]] = ((CH) >> 24) & 0xff; \
1722 p[iorder[2]] = ((CH) >> 16) & 0xff; \
1723 p[iorder[1]] = ((CH) >> 8) & 0xff; \
1724 p[iorder[0]] = (CH) & 0xff; \
1725 p += 4; \
1726 } while(0)
1727
1728 /* In narrow builds we can output surrogate pairs as one codepoint,
1729 so we need less space. */
1730#ifndef Py_UNICODE_WIDE
1731 for (i = pairs = 0; i < size-1; i++)
1732 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
1733 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
1734 pairs++;
1735#endif
1736 v = PyString_FromStringAndSize(NULL,
1737 4 * (size - pairs + (byteorder == 0)));
1738 if (v == NULL)
1739 return NULL;
1740
1741 p = (unsigned char *)PyString_AS_STRING(v);
1742 if (byteorder == 0)
1743 STORECHAR(0xFEFF);
1744 if (size == 0)
1745 return v;
1746
1747 if (byteorder == -1) {
1748 /* force LE */
1749 iorder[0] = 0;
1750 iorder[1] = 1;
1751 iorder[2] = 2;
1752 iorder[3] = 3;
1753 }
1754 else if (byteorder == 1) {
1755 /* force BE */
1756 iorder[0] = 3;
1757 iorder[1] = 2;
1758 iorder[2] = 1;
1759 iorder[3] = 0;
1760 }
1761
1762 while (size-- > 0) {
1763 Py_UCS4 ch = *s++;
1764#ifndef Py_UNICODE_WIDE
1765 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
1766 Py_UCS4 ch2 = *s;
1767 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1768 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1769 s++;
1770 size--;
1771 }
1772 }
1773#endif
1774 STORECHAR(ch);
1775 }
1776 return v;
1777#undef STORECHAR
1778}
1779
1780PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
1781{
1782 if (!PyUnicode_Check(unicode)) {
1783 PyErr_BadArgument();
1784 return NULL;
1785 }
1786 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
1787 PyUnicode_GET_SIZE(unicode),
1788 NULL,
1789 0);
1790}
1791
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792/* --- UTF-16 Codec ------------------------------------------------------- */
1793
Tim Peters772747b2001-08-09 22:21:55 +00001794PyObject *
1795PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001796 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001797 const char *errors,
1798 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799{
Walter Dörwald69652032004-09-07 20:24:22 +00001800 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1801}
1802
1803PyObject *
1804PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001805 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001806 const char *errors,
1807 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001808 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001811 Py_ssize_t startinpos;
1812 Py_ssize_t endinpos;
1813 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 PyUnicodeObject *unicode;
1815 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001816 const unsigned char *q, *e;
1817 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001818 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001819 /* Offsets from q for retrieving byte pairs in the right order. */
1820#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1821 int ihi = 1, ilo = 0;
1822#else
1823 int ihi = 0, ilo = 1;
1824#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 PyObject *errorHandler = NULL;
1826 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827
1828 /* Note: size will always be longer than the resulting Unicode
1829 character count */
1830 unicode = _PyUnicode_New(size);
1831 if (!unicode)
1832 return NULL;
1833 if (size == 0)
1834 return (PyObject *)unicode;
1835
1836 /* Unpack UTF-16 encoded data */
1837 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001838 q = (unsigned char *)s;
1839 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840
1841 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001842 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001844 /* Check for BOM marks (U+FEFF) in the input and adjust current
1845 byte order setting accordingly. In native mode, the leading BOM
1846 mark is skipped, in all other modes, it is copied to the output
1847 stream as-is (giving a ZWNBSP character). */
1848 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001849 if (size >= 2) {
1850 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001851#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001852 if (bom == 0xFEFF) {
1853 q += 2;
1854 bo = -1;
1855 }
1856 else if (bom == 0xFFFE) {
1857 q += 2;
1858 bo = 1;
1859 }
Tim Petersced69f82003-09-16 20:30:58 +00001860#else
Walter Dörwald69652032004-09-07 20:24:22 +00001861 if (bom == 0xFEFF) {
1862 q += 2;
1863 bo = 1;
1864 }
1865 else if (bom == 0xFFFE) {
1866 q += 2;
1867 bo = -1;
1868 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001869#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001870 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872
Tim Peters772747b2001-08-09 22:21:55 +00001873 if (bo == -1) {
1874 /* force LE */
1875 ihi = 1;
1876 ilo = 0;
1877 }
1878 else if (bo == 1) {
1879 /* force BE */
1880 ihi = 0;
1881 ilo = 1;
1882 }
1883
1884 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001886 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001887 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001888 if (consumed)
1889 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 errmsg = "truncated data";
1891 startinpos = ((const char *)q)-starts;
1892 endinpos = ((const char *)e)-starts;
1893 goto utf16Error;
1894 /* The remaining input chars are ignored if the callback
1895 chooses to skip the input */
1896 }
1897 ch = (q[ihi] << 8) | q[ilo];
1898
Tim Peters772747b2001-08-09 22:21:55 +00001899 q += 2;
1900
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 if (ch < 0xD800 || ch > 0xDFFF) {
1902 *p++ = ch;
1903 continue;
1904 }
1905
1906 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 if (q >= e) {
1908 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001909 startinpos = (((const char *)q)-2)-starts;
1910 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001911 goto utf16Error;
1912 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001913 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001914 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1915 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001916 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001917#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001918 *p++ = ch;
1919 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001920#else
1921 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001922#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001923 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001924 }
1925 else {
1926 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927 startinpos = (((const char *)q)-4)-starts;
1928 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001929 goto utf16Error;
1930 }
1931
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 startinpos = (((const char *)q)-2)-starts;
1935 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 /* Fall through to report the error */
1937
1938 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001939 outpos = p-PyUnicode_AS_UNICODE(unicode);
1940 if (unicode_decode_call_errorhandler(
1941 errors, &errorHandler,
1942 "utf16", errmsg,
1943 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1944 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001945 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946 }
1947
1948 if (byteorder)
1949 *byteorder = bo;
1950
Walter Dörwald69652032004-09-07 20:24:22 +00001951 if (consumed)
1952 *consumed = (const char *)q-starts;
1953
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001955 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 goto onError;
1957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001958 Py_XDECREF(errorHandler);
1959 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 return (PyObject *)unicode;
1961
1962onError:
1963 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001964 Py_XDECREF(errorHandler);
1965 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 return NULL;
1967}
1968
Tim Peters772747b2001-08-09 22:21:55 +00001969PyObject *
1970PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001971 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001972 const char *errors,
1973 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974{
1975 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001976 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001977#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001978 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001979#else
1980 const int pairs = 0;
1981#endif
Tim Peters772747b2001-08-09 22:21:55 +00001982 /* Offsets from p for storing byte pairs in the right order. */
1983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1984 int ihi = 1, ilo = 0;
1985#else
1986 int ihi = 0, ilo = 1;
1987#endif
1988
1989#define STORECHAR(CH) \
1990 do { \
1991 p[ihi] = ((CH) >> 8) & 0xff; \
1992 p[ilo] = (CH) & 0xff; \
1993 p += 2; \
1994 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001996#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001997 for (i = pairs = 0; i < size; i++)
1998 if (s[i] >= 0x10000)
1999 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002000#endif
Tim Petersced69f82003-09-16 20:30:58 +00002001 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002002 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 if (v == NULL)
2004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005
Tim Peters772747b2001-08-09 22:21:55 +00002006 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002008 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002009 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002010 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002011
2012 if (byteorder == -1) {
2013 /* force LE */
2014 ihi = 1;
2015 ilo = 0;
2016 }
2017 else if (byteorder == 1) {
2018 /* force BE */
2019 ihi = 0;
2020 ilo = 1;
2021 }
2022
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002023 while (size-- > 0) {
2024 Py_UNICODE ch = *s++;
2025 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002026#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002027 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002028 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2029 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002031#endif
Tim Peters772747b2001-08-09 22:21:55 +00002032 STORECHAR(ch);
2033 if (ch2)
2034 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002037#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038}
2039
2040PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2041{
2042 if (!PyUnicode_Check(unicode)) {
2043 PyErr_BadArgument();
2044 return NULL;
2045 }
2046 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2047 PyUnicode_GET_SIZE(unicode),
2048 NULL,
2049 0);
2050}
2051
2052/* --- Unicode Escape Codec ----------------------------------------------- */
2053
Fredrik Lundh06d12682001-01-24 07:59:11 +00002054static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002055
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002057 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 const char *errors)
2059{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002061 Py_ssize_t startinpos;
2062 Py_ssize_t endinpos;
2063 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002068 char* message;
2069 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 PyObject *errorHandler = NULL;
2071 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002072
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 /* Escaped strings will always be longer than the resulting
2074 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 length after conversion to the true value.
2076 (but if the error callback returns a long replacement string
2077 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 v = _PyUnicode_New(size);
2079 if (v == NULL)
2080 goto onError;
2081 if (size == 0)
2082 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002086
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 while (s < end) {
2088 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002089 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002090 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091
2092 /* Non-escape characters are interpreted as Unicode ordinals */
2093 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002094 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 continue;
2096 }
2097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002098 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 /* \ - Escapes */
2100 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002101 c = *s++;
2102 if (s > end)
2103 c = '\0'; /* Invalid after \ */
2104 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105
2106 /* \x escapes */
2107 case '\n': break;
2108 case '\\': *p++ = '\\'; break;
2109 case '\'': *p++ = '\''; break;
2110 case '\"': *p++ = '\"'; break;
2111 case 'b': *p++ = '\b'; break;
2112 case 'f': *p++ = '\014'; break; /* FF */
2113 case 't': *p++ = '\t'; break;
2114 case 'n': *p++ = '\n'; break;
2115 case 'r': *p++ = '\r'; break;
2116 case 'v': *p++ = '\013'; break; /* VT */
2117 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2118
2119 /* \OOO (octal) escapes */
2120 case '0': case '1': case '2': case '3':
2121 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002122 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002123 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002124 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002125 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002126 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002128 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 break;
2130
Fredrik Lundhccc74732001-02-18 22:13:49 +00002131 /* hex escapes */
2132 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002134 digits = 2;
2135 message = "truncated \\xXX escape";
2136 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137
Fredrik Lundhccc74732001-02-18 22:13:49 +00002138 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002140 digits = 4;
2141 message = "truncated \\uXXXX escape";
2142 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143
Fredrik Lundhccc74732001-02-18 22:13:49 +00002144 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002145 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002146 digits = 8;
2147 message = "truncated \\UXXXXXXXX escape";
2148 hexescape:
2149 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 outpos = p-PyUnicode_AS_UNICODE(v);
2151 if (s+digits>end) {
2152 endinpos = size;
2153 if (unicode_decode_call_errorhandler(
2154 errors, &errorHandler,
2155 "unicodeescape", "end of string in escape sequence",
2156 starts, size, &startinpos, &endinpos, &exc, &s,
2157 (PyObject **)&v, &outpos, &p))
2158 goto onError;
2159 goto nextByte;
2160 }
2161 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002162 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002163 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002164 endinpos = (s+i+1)-starts;
2165 if (unicode_decode_call_errorhandler(
2166 errors, &errorHandler,
2167 "unicodeescape", message,
2168 starts, size, &startinpos, &endinpos, &exc, &s,
2169 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002170 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002172 }
2173 chr = (chr<<4) & ~0xF;
2174 if (c >= '0' && c <= '9')
2175 chr += c - '0';
2176 else if (c >= 'a' && c <= 'f')
2177 chr += 10 + c - 'a';
2178 else
2179 chr += 10 + c - 'A';
2180 }
2181 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002182 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 /* _decoding_error will have already written into the
2184 target buffer. */
2185 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002186 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002187 /* when we get here, chr is a 32-bit unicode character */
2188 if (chr <= 0xffff)
2189 /* UCS-2 character */
2190 *p++ = (Py_UNICODE) chr;
2191 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002192 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002193 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002194#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002195 *p++ = chr;
2196#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002197 chr -= 0x10000L;
2198 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002199 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002200#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002201 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 endinpos = s-starts;
2203 outpos = p-PyUnicode_AS_UNICODE(v);
2204 if (unicode_decode_call_errorhandler(
2205 errors, &errorHandler,
2206 "unicodeescape", "illegal Unicode character",
2207 starts, size, &startinpos, &endinpos, &exc, &s,
2208 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002209 goto onError;
2210 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002211 break;
2212
2213 /* \N{name} */
2214 case 'N':
2215 message = "malformed \\N character escape";
2216 if (ucnhash_CAPI == NULL) {
2217 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002218 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002219 m = PyImport_ImportModule("unicodedata");
2220 if (m == NULL)
2221 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002222 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002223 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002224 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002225 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002226 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002227 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002228 if (ucnhash_CAPI == NULL)
2229 goto ucnhashError;
2230 }
2231 if (*s == '{') {
2232 const char *start = s+1;
2233 /* look for the closing brace */
2234 while (*s != '}' && s < end)
2235 s++;
2236 if (s > start && s < end && *s == '}') {
2237 /* found a name. look it up in the unicode database */
2238 message = "unknown Unicode character name";
2239 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002240 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002241 goto store;
2242 }
2243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 endinpos = s-starts;
2245 outpos = p-PyUnicode_AS_UNICODE(v);
2246 if (unicode_decode_call_errorhandler(
2247 errors, &errorHandler,
2248 "unicodeescape", message,
2249 starts, size, &startinpos, &endinpos, &exc, &s,
2250 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002251 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002252 break;
2253
2254 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002255 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 message = "\\ at end of string";
2257 s--;
2258 endinpos = s-starts;
2259 outpos = p-PyUnicode_AS_UNICODE(v);
2260 if (unicode_decode_call_errorhandler(
2261 errors, &errorHandler,
2262 "unicodeescape", message,
2263 starts, size, &startinpos, &endinpos, &exc, &s,
2264 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002265 goto onError;
2266 }
2267 else {
2268 *p++ = '\\';
2269 *p++ = (unsigned char)s[-1];
2270 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002271 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 nextByte:
2274 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002276 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002278 Py_XDECREF(errorHandler);
2279 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002281
Fredrik Lundhccc74732001-02-18 22:13:49 +00002282ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002283 PyErr_SetString(
2284 PyExc_UnicodeError,
2285 "\\N escapes not supported (can't load unicodedata module)"
2286 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002287 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002288 Py_XDECREF(errorHandler);
2289 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002290 return NULL;
2291
Fredrik Lundhccc74732001-02-18 22:13:49 +00002292onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 Py_XDECREF(errorHandler);
2295 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 return NULL;
2297}
2298
/* unicodeescape_string() (defined after the findchar() helper below)
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2305
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002306Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002307 Py_ssize_t size,
2308 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002309{
2310 /* like wcschr, but doesn't stop at NULL characters */
2311
2312 while (size-- > 0) {
2313 if (*s == ch)
2314 return s;
2315 s++;
2316 }
2317
2318 return NULL;
2319}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002320
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321static
2322PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002323 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324 int quotes)
2325{
2326 PyObject *repr;
2327 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002329 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330
Neal Norwitz17753ec2006-08-21 22:21:19 +00002331 /* XXX(nnorwitz): rather than over-allocating, it would be
2332 better to choose a different scheme. Perhaps scan the
2333 first N-chars of the string and allocate based on that size.
2334 */
2335 /* Initial allocation is based on the longest-possible unichr
2336 escape.
2337
2338 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2339 unichr, so in this case it's the longest unichr escape. In
2340 narrow (UTF-16) builds this is five chars per source unichr
2341 since there are two unichrs in the surrogate pair, so in narrow
2342 (UTF-16) builds it's not the longest unichr escape.
2343
2344 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2345 so in the narrow (UTF-16) build case it's the longest unichr
2346 escape.
2347 */
2348
2349 repr = PyString_FromStringAndSize(NULL,
2350 2
2351#ifdef Py_UNICODE_WIDE
2352 + 10*size
2353#else
2354 + 6*size
2355#endif
2356 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 if (repr == NULL)
2358 return NULL;
2359
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002360 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361
2362 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002364 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365 !findchar(s, size, '"')) ? '"' : '\'';
2366 }
2367 while (size-- > 0) {
2368 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002369
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002370 /* Escape quotes and backslashes */
2371 if ((quotes &&
2372 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373 *p++ = '\\';
2374 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002375 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002376 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002377
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002378#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002379 /* Map 21-bit characters to '\U00xxxxxx' */
2380 else if (ch >= 0x10000) {
2381 *p++ = '\\';
2382 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002383 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2384 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2385 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2386 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2387 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2388 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2389 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002390 *p++ = hexdigit[ch & 0x0000000F];
2391 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002392 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002393#else
2394 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002395 else if (ch >= 0xD800 && ch < 0xDC00) {
2396 Py_UNICODE ch2;
2397 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002398
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002399 ch2 = *s++;
2400 size--;
2401 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2402 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2403 *p++ = '\\';
2404 *p++ = 'U';
2405 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2406 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2407 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2408 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2409 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2410 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2411 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2412 *p++ = hexdigit[ucs & 0x0000000F];
2413 continue;
2414 }
2415 /* Fall through: isolated surrogates are copied as-is */
2416 s--;
2417 size++;
2418 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002419#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002420
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002422 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 *p++ = '\\';
2424 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002425 *p++ = hexdigit[(ch >> 12) & 0x000F];
2426 *p++ = hexdigit[(ch >> 8) & 0x000F];
2427 *p++ = hexdigit[(ch >> 4) & 0x000F];
2428 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002430
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002431 /* Map special whitespace to '\t', \n', '\r' */
2432 else if (ch == '\t') {
2433 *p++ = '\\';
2434 *p++ = 't';
2435 }
2436 else if (ch == '\n') {
2437 *p++ = '\\';
2438 *p++ = 'n';
2439 }
2440 else if (ch == '\r') {
2441 *p++ = '\\';
2442 *p++ = 'r';
2443 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002444
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002445 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002446 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002448 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002449 *p++ = hexdigit[(ch >> 4) & 0x000F];
2450 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002451 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002452
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 /* Copy everything else as-is */
2454 else
2455 *p++ = (char) ch;
2456 }
2457 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002458 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459
2460 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002461 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 return repr;
2463}
2464
2465PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002466 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467{
2468 return unicodeescape_string(s, size, 0);
2469}
2470
2471PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2472{
2473 if (!PyUnicode_Check(unicode)) {
2474 PyErr_BadArgument();
2475 return NULL;
2476 }
2477 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2478 PyUnicode_GET_SIZE(unicode));
2479}
2480
2481/* --- Raw Unicode Escape Codec ------------------------------------------- */
2482
2483PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002484 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 const char *errors)
2486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002488 Py_ssize_t startinpos;
2489 Py_ssize_t endinpos;
2490 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493 const char *end;
2494 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495 PyObject *errorHandler = NULL;
2496 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002497
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 /* Escaped strings will always be longer than the resulting
2499 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 length after conversion to the true value. (But decoding error
2501 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 v = _PyUnicode_New(size);
2503 if (v == NULL)
2504 goto onError;
2505 if (size == 0)
2506 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 end = s + size;
2509 while (s < end) {
2510 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002511 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002513 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514
2515 /* Non-escape characters are interpreted as Unicode ordinals */
2516 if (*s != '\\') {
2517 *p++ = (unsigned char)*s++;
2518 continue;
2519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002520 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521
2522 /* \u-escapes are only interpreted iff the number of leading
2523 backslashes if odd */
2524 bs = s;
2525 for (;s < end;) {
2526 if (*s != '\\')
2527 break;
2528 *p++ = (unsigned char)*s++;
2529 }
2530 if (((s - bs) & 1) == 0 ||
2531 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002532 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 continue;
2534 }
2535 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002536 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 s++;
2538
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002539 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002541 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 endinpos = s-starts;
2545 if (unicode_decode_call_errorhandler(
2546 errors, &errorHandler,
2547 "rawunicodeescape", "truncated \\uXXXX",
2548 starts, size, &startinpos, &endinpos, &exc, &s,
2549 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002551 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 }
2553 x = (x<<4) & ~0xF;
2554 if (c >= '0' && c <= '9')
2555 x += c - '0';
2556 else if (c >= 'a' && c <= 'f')
2557 x += 10 + c - 'a';
2558 else
2559 x += 10 + c - 'A';
2560 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002561#ifndef Py_UNICODE_WIDE
2562 if (x > 0x10000) {
2563 if (unicode_decode_call_errorhandler(
2564 errors, &errorHandler,
2565 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2566 starts, size, &startinpos, &endinpos, &exc, &s,
2567 (PyObject **)&v, &outpos, &p))
2568 goto onError;
2569 }
2570#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002571 *p++ = x;
2572 nextByte:
2573 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002575 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002576 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 Py_XDECREF(errorHandler);
2578 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002580
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 onError:
2582 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 Py_XDECREF(errorHandler);
2584 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 return NULL;
2586}
2587
2588PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002589 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590{
2591 PyObject *repr;
2592 char *p;
2593 char *q;
2594
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002595 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002597#ifdef Py_UNICODE_WIDE
2598 repr = PyString_FromStringAndSize(NULL, 10 * size);
2599#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002601#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 if (repr == NULL)
2603 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002604 if (size == 0)
2605 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606
2607 p = q = PyString_AS_STRING(repr);
2608 while (size-- > 0) {
2609 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002610#ifdef Py_UNICODE_WIDE
2611 /* Map 32-bit characters to '\Uxxxxxxxx' */
2612 if (ch >= 0x10000) {
2613 *p++ = '\\';
2614 *p++ = 'U';
2615 *p++ = hexdigit[(ch >> 28) & 0xf];
2616 *p++ = hexdigit[(ch >> 24) & 0xf];
2617 *p++ = hexdigit[(ch >> 20) & 0xf];
2618 *p++ = hexdigit[(ch >> 16) & 0xf];
2619 *p++ = hexdigit[(ch >> 12) & 0xf];
2620 *p++ = hexdigit[(ch >> 8) & 0xf];
2621 *p++ = hexdigit[(ch >> 4) & 0xf];
2622 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002623 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002624 else
2625#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 /* Map 16-bit characters to '\uxxxx' */
2627 if (ch >= 256) {
2628 *p++ = '\\';
2629 *p++ = 'u';
2630 *p++ = hexdigit[(ch >> 12) & 0xf];
2631 *p++ = hexdigit[(ch >> 8) & 0xf];
2632 *p++ = hexdigit[(ch >> 4) & 0xf];
2633 *p++ = hexdigit[ch & 15];
2634 }
2635 /* Copy everything else as-is */
2636 else
2637 *p++ = (char) ch;
2638 }
2639 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002640 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 return repr;
2642}
2643
2644PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2645{
2646 if (!PyUnicode_Check(unicode)) {
2647 PyErr_BadArgument();
2648 return NULL;
2649 }
2650 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2651 PyUnicode_GET_SIZE(unicode));
2652}
2653
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002654/* --- Unicode Internal Codec ------------------------------------------- */
2655
2656PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002657 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002658 const char *errors)
2659{
2660 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002661 Py_ssize_t startinpos;
2662 Py_ssize_t endinpos;
2663 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002664 PyUnicodeObject *v;
2665 Py_UNICODE *p;
2666 const char *end;
2667 const char *reason;
2668 PyObject *errorHandler = NULL;
2669 PyObject *exc = NULL;
2670
Neal Norwitzd43069c2006-01-08 01:12:10 +00002671#ifdef Py_UNICODE_WIDE
2672 Py_UNICODE unimax = PyUnicode_GetMax();
2673#endif
2674
Armin Rigo7ccbca92006-10-04 12:17:45 +00002675 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002676 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2677 if (v == NULL)
2678 goto onError;
2679 if (PyUnicode_GetSize((PyObject *)v) == 0)
2680 return (PyObject *)v;
2681 p = PyUnicode_AS_UNICODE(v);
2682 end = s + size;
2683
2684 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002685 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002686 /* We have to sanity check the raw data, otherwise doom looms for
2687 some malformed UCS-4 data. */
2688 if (
2689 #ifdef Py_UNICODE_WIDE
2690 *p > unimax || *p < 0 ||
2691 #endif
2692 end-s < Py_UNICODE_SIZE
2693 )
2694 {
2695 startinpos = s - starts;
2696 if (end-s < Py_UNICODE_SIZE) {
2697 endinpos = end-starts;
2698 reason = "truncated input";
2699 }
2700 else {
2701 endinpos = s - starts + Py_UNICODE_SIZE;
2702 reason = "illegal code point (> 0x10FFFF)";
2703 }
2704 outpos = p - PyUnicode_AS_UNICODE(v);
2705 if (unicode_decode_call_errorhandler(
2706 errors, &errorHandler,
2707 "unicode_internal", reason,
2708 starts, size, &startinpos, &endinpos, &exc, &s,
2709 (PyObject **)&v, &outpos, &p)) {
2710 goto onError;
2711 }
2712 }
2713 else {
2714 p++;
2715 s += Py_UNICODE_SIZE;
2716 }
2717 }
2718
Martin v. Löwis412fb672006-04-13 06:34:32 +00002719 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002720 goto onError;
2721 Py_XDECREF(errorHandler);
2722 Py_XDECREF(exc);
2723 return (PyObject *)v;
2724
2725 onError:
2726 Py_XDECREF(v);
2727 Py_XDECREF(errorHandler);
2728 Py_XDECREF(exc);
2729 return NULL;
2730}
2731
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732/* --- Latin-1 Codec ------------------------------------------------------ */
2733
2734PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002735 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 const char *errors)
2737{
2738 PyUnicodeObject *v;
2739 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002740
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002742 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002743 Py_UNICODE r = *(unsigned char*)s;
2744 return PyUnicode_FromUnicode(&r, 1);
2745 }
2746
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 v = _PyUnicode_New(size);
2748 if (v == NULL)
2749 goto onError;
2750 if (size == 0)
2751 return (PyObject *)v;
2752 p = PyUnicode_AS_UNICODE(v);
2753 while (size-- > 0)
2754 *p++ = (unsigned char)*s++;
2755 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002756
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 onError:
2758 Py_XDECREF(v);
2759 return NULL;
2760}
2761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762/* create or adjust a UnicodeEncodeError */
2763static void make_encode_exception(PyObject **exceptionObject,
2764 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002765 const Py_UNICODE *unicode, Py_ssize_t size,
2766 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 if (*exceptionObject == NULL) {
2770 *exceptionObject = PyUnicodeEncodeError_Create(
2771 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 }
2773 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2775 goto onError;
2776 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2777 goto onError;
2778 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2779 goto onError;
2780 return;
2781 onError:
2782 Py_DECREF(*exceptionObject);
2783 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 }
2785}
2786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787/* raises a UnicodeEncodeError */
2788static void raise_encode_exception(PyObject **exceptionObject,
2789 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002790 const Py_UNICODE *unicode, Py_ssize_t size,
2791 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792 const char *reason)
2793{
2794 make_encode_exception(exceptionObject,
2795 encoding, unicode, size, startpos, endpos, reason);
2796 if (*exceptionObject != NULL)
2797 PyCodec_StrictErrors(*exceptionObject);
2798}
2799
2800/* error handling callback helper:
2801 build arguments, call the callback and check the arguments,
2802 put the result into newpos and return the replacement string, which
2803 has to be freed by the caller */
2804static PyObject *unicode_encode_call_errorhandler(const char *errors,
2805 PyObject **errorHandler,
2806 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002807 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2808 Py_ssize_t startpos, Py_ssize_t endpos,
2809 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002811 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812
2813 PyObject *restuple;
2814 PyObject *resunicode;
2815
2816 if (*errorHandler == NULL) {
2817 *errorHandler = PyCodec_LookupError(errors);
2818 if (*errorHandler == NULL)
2819 return NULL;
2820 }
2821
2822 make_encode_exception(exceptionObject,
2823 encoding, unicode, size, startpos, endpos, reason);
2824 if (*exceptionObject == NULL)
2825 return NULL;
2826
2827 restuple = PyObject_CallFunctionObjArgs(
2828 *errorHandler, *exceptionObject, NULL);
2829 if (restuple == NULL)
2830 return NULL;
2831 if (!PyTuple_Check(restuple)) {
2832 PyErr_Format(PyExc_TypeError, &argparse[4]);
2833 Py_DECREF(restuple);
2834 return NULL;
2835 }
2836 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2837 &resunicode, newpos)) {
2838 Py_DECREF(restuple);
2839 return NULL;
2840 }
2841 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002842 *newpos = size+*newpos;
2843 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002844 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002845 Py_DECREF(restuple);
2846 return NULL;
2847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 Py_INCREF(resunicode);
2849 Py_DECREF(restuple);
2850 return resunicode;
2851}
2852
2853static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002854 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 const char *errors,
2856 int limit)
2857{
2858 /* output object */
2859 PyObject *res;
2860 /* pointers to the beginning and end+1 of input */
2861 const Py_UNICODE *startp = p;
2862 const Py_UNICODE *endp = p + size;
2863 /* pointer to the beginning of the unencodable characters */
2864 /* const Py_UNICODE *badp = NULL; */
2865 /* pointer into the output */
2866 char *str;
2867 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002868 Py_ssize_t respos = 0;
2869 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002870 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2871 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 PyObject *errorHandler = NULL;
2873 PyObject *exc = NULL;
2874 /* the following variable is used for caching string comparisons
2875 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2876 int known_errorHandler = -1;
2877
2878 /* allocate enough for a simple encoding without
2879 replacements, if we need more, we'll resize */
2880 res = PyString_FromStringAndSize(NULL, size);
2881 if (res == NULL)
2882 goto onError;
2883 if (size == 0)
2884 return res;
2885 str = PyString_AS_STRING(res);
2886 ressize = size;
2887
2888 while (p<endp) {
2889 Py_UNICODE c = *p;
2890
2891 /* can we encode this? */
2892 if (c<limit) {
2893 /* no overflow check, because we know that the space is enough */
2894 *str++ = (char)c;
2895 ++p;
2896 }
2897 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002898 Py_ssize_t unicodepos = p-startp;
2899 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002900 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002901 Py_ssize_t repsize;
2902 Py_ssize_t newpos;
2903 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904 Py_UNICODE *uni2;
2905 /* startpos for collecting unencodable chars */
2906 const Py_UNICODE *collstart = p;
2907 const Py_UNICODE *collend = p;
2908 /* find all unecodable characters */
2909 while ((collend < endp) && ((*collend)>=limit))
2910 ++collend;
2911 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2912 if (known_errorHandler==-1) {
2913 if ((errors==NULL) || (!strcmp(errors, "strict")))
2914 known_errorHandler = 1;
2915 else if (!strcmp(errors, "replace"))
2916 known_errorHandler = 2;
2917 else if (!strcmp(errors, "ignore"))
2918 known_errorHandler = 3;
2919 else if (!strcmp(errors, "xmlcharrefreplace"))
2920 known_errorHandler = 4;
2921 else
2922 known_errorHandler = 0;
2923 }
2924 switch (known_errorHandler) {
2925 case 1: /* strict */
2926 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2927 goto onError;
2928 case 2: /* replace */
2929 while (collstart++<collend)
2930 *str++ = '?'; /* fall through */
2931 case 3: /* ignore */
2932 p = collend;
2933 break;
2934 case 4: /* xmlcharrefreplace */
2935 respos = str-PyString_AS_STRING(res);
2936 /* determine replacement size (temporarily (mis)uses p) */
2937 for (p = collstart, repsize = 0; p < collend; ++p) {
2938 if (*p<10)
2939 repsize += 2+1+1;
2940 else if (*p<100)
2941 repsize += 2+2+1;
2942 else if (*p<1000)
2943 repsize += 2+3+1;
2944 else if (*p<10000)
2945 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002946#ifndef Py_UNICODE_WIDE
2947 else
2948 repsize += 2+5+1;
2949#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 else if (*p<100000)
2951 repsize += 2+5+1;
2952 else if (*p<1000000)
2953 repsize += 2+6+1;
2954 else
2955 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002956#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 }
2958 requiredsize = respos+repsize+(endp-collend);
2959 if (requiredsize > ressize) {
2960 if (requiredsize<2*ressize)
2961 requiredsize = 2*ressize;
2962 if (_PyString_Resize(&res, requiredsize))
2963 goto onError;
2964 str = PyString_AS_STRING(res) + respos;
2965 ressize = requiredsize;
2966 }
2967 /* generate replacement (temporarily (mis)uses p) */
2968 for (p = collstart; p < collend; ++p) {
2969 str += sprintf(str, "&#%d;", (int)*p);
2970 }
2971 p = collend;
2972 break;
2973 default:
2974 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2975 encoding, reason, startp, size, &exc,
2976 collstart-startp, collend-startp, &newpos);
2977 if (repunicode == NULL)
2978 goto onError;
2979 /* need more space? (at least enough for what we
2980 have+the replacement+the rest of the string, so
2981 we won't have to check space for encodable characters) */
2982 respos = str-PyString_AS_STRING(res);
2983 repsize = PyUnicode_GET_SIZE(repunicode);
2984 requiredsize = respos+repsize+(endp-collend);
2985 if (requiredsize > ressize) {
2986 if (requiredsize<2*ressize)
2987 requiredsize = 2*ressize;
2988 if (_PyString_Resize(&res, requiredsize)) {
2989 Py_DECREF(repunicode);
2990 goto onError;
2991 }
2992 str = PyString_AS_STRING(res) + respos;
2993 ressize = requiredsize;
2994 }
2995 /* check if there is anything unencodable in the replacement
2996 and copy it to the output */
2997 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2998 c = *uni2;
2999 if (c >= limit) {
3000 raise_encode_exception(&exc, encoding, startp, size,
3001 unicodepos, unicodepos+1, reason);
3002 Py_DECREF(repunicode);
3003 goto onError;
3004 }
3005 *str = (char)c;
3006 }
3007 p = startp + newpos;
3008 Py_DECREF(repunicode);
3009 }
3010 }
3011 }
3012 /* Resize if we allocated to much */
3013 respos = str-PyString_AS_STRING(res);
3014 if (respos<ressize)
3015 /* If this falls res will be NULL */
3016 _PyString_Resize(&res, respos);
3017 Py_XDECREF(errorHandler);
3018 Py_XDECREF(exc);
3019 return res;
3020
3021 onError:
3022 Py_XDECREF(res);
3023 Py_XDECREF(errorHandler);
3024 Py_XDECREF(exc);
3025 return NULL;
3026}
3027
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003029 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 const char *errors)
3031{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033}
3034
3035PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3036{
3037 if (!PyUnicode_Check(unicode)) {
3038 PyErr_BadArgument();
3039 return NULL;
3040 }
3041 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3042 PyUnicode_GET_SIZE(unicode),
3043 NULL);
3044}
3045
3046/* --- 7-bit ASCII Codec -------------------------------------------------- */
3047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003049 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 const char *errors)
3051{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 PyUnicodeObject *v;
3054 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003055 Py_ssize_t startinpos;
3056 Py_ssize_t endinpos;
3057 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 const char *e;
3059 PyObject *errorHandler = NULL;
3060 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003061
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003063 if (size == 1 && *(unsigned char*)s < 128) {
3064 Py_UNICODE r = *(unsigned char*)s;
3065 return PyUnicode_FromUnicode(&r, 1);
3066 }
Tim Petersced69f82003-09-16 20:30:58 +00003067
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 v = _PyUnicode_New(size);
3069 if (v == NULL)
3070 goto onError;
3071 if (size == 0)
3072 return (PyObject *)v;
3073 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 e = s + size;
3075 while (s < e) {
3076 register unsigned char c = (unsigned char)*s;
3077 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 ++s;
3080 }
3081 else {
3082 startinpos = s-starts;
3083 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003084 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 if (unicode_decode_call_errorhandler(
3086 errors, &errorHandler,
3087 "ascii", "ordinal not in range(128)",
3088 starts, size, &startinpos, &endinpos, &exc, &s,
3089 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003093 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003094 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003095 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003096 Py_XDECREF(errorHandler);
3097 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003099
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 onError:
3101 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003102 Py_XDECREF(errorHandler);
3103 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 return NULL;
3105}
3106
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003108 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 const char *errors)
3110{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112}
3113
3114PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3115{
3116 if (!PyUnicode_Check(unicode)) {
3117 PyErr_BadArgument();
3118 return NULL;
3119 }
3120 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3121 PyUnicode_GET_SIZE(unicode),
3122 NULL);
3123}
3124
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003125#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003126
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003127/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003128
Martin v. Löwisd8251432006-06-14 05:21:04 +00003129#if SIZEOF_INT < SIZEOF_SSIZE_T
3130#define NEED_RETRY
3131#endif
3132
3133/* XXX This code is limited to "true" double-byte encodings, as
3134 a) it assumes an incomplete character consists of a single byte, and
3135 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3136 encodings, see IsDBCSLeadByteEx documentation. */
3137
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back (it is the first character), when the
   previous character does not start with a lead byte, or when the previous
   character is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3148
3149/*
3150 * Decode MBCS string into unicode object. If 'final' is set, converts
3151 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3152 */
3153static int decode_mbcs(PyUnicodeObject **v,
3154 const char *s, /* MBCS string */
3155 int size, /* sizeof MBCS string */
3156 int final)
3157{
3158 Py_UNICODE *p;
3159 Py_ssize_t n = 0;
3160 int usize = 0;
3161
3162 assert(size >= 0);
3163
3164 /* Skip trailing lead-byte unless 'final' is set */
3165 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3166 --size;
3167
3168 /* First get the size of the result */
3169 if (size > 0) {
3170 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3171 if (usize == 0) {
3172 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3173 return -1;
3174 }
3175 }
3176
3177 if (*v == NULL) {
3178 /* Create unicode object */
3179 *v = _PyUnicode_New(usize);
3180 if (*v == NULL)
3181 return -1;
3182 }
3183 else {
3184 /* Extend unicode object */
3185 n = PyUnicode_GET_SIZE(*v);
3186 if (_PyUnicode_Resize(v, n + usize) < 0)
3187 return -1;
3188 }
3189
3190 /* Do the conversion */
3191 if (size > 0) {
3192 p = PyUnicode_AS_UNICODE(*v) + n;
3193 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3194 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3195 return -1;
3196 }
3197 }
3198
3199 return size;
3200}
3201
3202PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3203 Py_ssize_t size,
3204 const char *errors,
3205 Py_ssize_t *consumed)
3206{
3207 PyUnicodeObject *v = NULL;
3208 int done;
3209
3210 if (consumed)
3211 *consumed = 0;
3212
3213#ifdef NEED_RETRY
3214 retry:
3215 if (size > INT_MAX)
3216 done = decode_mbcs(&v, s, INT_MAX, 0);
3217 else
3218#endif
3219 done = decode_mbcs(&v, s, (int)size, !consumed);
3220
3221 if (done < 0) {
3222 Py_XDECREF(v);
3223 return NULL;
3224 }
3225
3226 if (consumed)
3227 *consumed += done;
3228
3229#ifdef NEED_RETRY
3230 if (size > INT_MAX) {
3231 s += done;
3232 size -= done;
3233 goto retry;
3234 }
3235#endif
3236
3237 return (PyObject *)v;
3238}
3239
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003240PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003241 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003242 const char *errors)
3243{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003244 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3245}
3246
3247/*
3248 * Convert unicode into string object (MBCS).
3249 * Returns 0 if succeed, -1 otherwise.
3250 */
3251static int encode_mbcs(PyObject **repr,
3252 const Py_UNICODE *p, /* unicode */
3253 int size) /* size of unicode */
3254{
3255 int mbcssize = 0;
3256 Py_ssize_t n = 0;
3257
3258 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003259
3260 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003261 if (size > 0) {
3262 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3263 if (mbcssize == 0) {
3264 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3265 return -1;
3266 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003267 }
3268
Martin v. Löwisd8251432006-06-14 05:21:04 +00003269 if (*repr == NULL) {
3270 /* Create string object */
3271 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3272 if (*repr == NULL)
3273 return -1;
3274 }
3275 else {
3276 /* Extend string object */
3277 n = PyString_Size(*repr);
3278 if (_PyString_Resize(repr, n + mbcssize) < 0)
3279 return -1;
3280 }
3281
3282 /* Do the conversion */
3283 if (size > 0) {
3284 char *s = PyString_AS_STRING(*repr) + n;
3285 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3286 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3287 return -1;
3288 }
3289 }
3290
3291 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003292}
3293
3294PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003295 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003296 const char *errors)
3297{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003298 PyObject *repr = NULL;
3299 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003300
Martin v. Löwisd8251432006-06-14 05:21:04 +00003301#ifdef NEED_RETRY
3302 retry:
3303 if (size > INT_MAX)
3304 ret = encode_mbcs(&repr, p, INT_MAX);
3305 else
3306#endif
3307 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003308
Martin v. Löwisd8251432006-06-14 05:21:04 +00003309 if (ret < 0) {
3310 Py_XDECREF(repr);
3311 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003312 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003313
3314#ifdef NEED_RETRY
3315 if (size > INT_MAX) {
3316 p += INT_MAX;
3317 size -= INT_MAX;
3318 goto retry;
3319 }
3320#endif
3321
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003322 return repr;
3323}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003324
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003325PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3326{
3327 if (!PyUnicode_Check(unicode)) {
3328 PyErr_BadArgument();
3329 return NULL;
3330 }
3331 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3332 PyUnicode_GET_SIZE(unicode),
3333 NULL);
3334}
3335
Martin v. Löwisd8251432006-06-14 05:21:04 +00003336#undef NEED_RETRY
3337
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003338#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003339
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340/* --- Character Mapping Codec -------------------------------------------- */
3341
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003343 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 PyObject *mapping,
3345 const char *errors)
3346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003348 Py_ssize_t startinpos;
3349 Py_ssize_t endinpos;
3350 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003351 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 PyUnicodeObject *v;
3353 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003354 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 PyObject *errorHandler = NULL;
3356 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003357 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003358 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003359
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 /* Default to Latin-1 */
3361 if (mapping == NULL)
3362 return PyUnicode_DecodeLatin1(s, size, errors);
3363
3364 v = _PyUnicode_New(size);
3365 if (v == NULL)
3366 goto onError;
3367 if (size == 0)
3368 return (PyObject *)v;
3369 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003371 if (PyUnicode_CheckExact(mapping)) {
3372 mapstring = PyUnicode_AS_UNICODE(mapping);
3373 maplen = PyUnicode_GET_SIZE(mapping);
3374 while (s < e) {
3375 unsigned char ch = *s;
3376 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003378 if (ch < maplen)
3379 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003381 if (x == 0xfffe) {
3382 /* undefined mapping */
3383 outpos = p-PyUnicode_AS_UNICODE(v);
3384 startinpos = s-starts;
3385 endinpos = startinpos+1;
3386 if (unicode_decode_call_errorhandler(
3387 errors, &errorHandler,
3388 "charmap", "character maps to <undefined>",
3389 starts, size, &startinpos, &endinpos, &exc, &s,
3390 (PyObject **)&v, &outpos, &p)) {
3391 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003392 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003393 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003394 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003395 *p++ = x;
3396 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003398 }
3399 else {
3400 while (s < e) {
3401 unsigned char ch = *s;
3402 PyObject *w, *x;
3403
3404 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3405 w = PyInt_FromLong((long)ch);
3406 if (w == NULL)
3407 goto onError;
3408 x = PyObject_GetItem(mapping, w);
3409 Py_DECREF(w);
3410 if (x == NULL) {
3411 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3412 /* No mapping found means: mapping is undefined. */
3413 PyErr_Clear();
3414 x = Py_None;
3415 Py_INCREF(x);
3416 } else
3417 goto onError;
3418 }
3419
3420 /* Apply mapping */
3421 if (PyInt_Check(x)) {
3422 long value = PyInt_AS_LONG(x);
3423 if (value < 0 || value > 65535) {
3424 PyErr_SetString(PyExc_TypeError,
3425 "character mapping must be in range(65536)");
3426 Py_DECREF(x);
3427 goto onError;
3428 }
3429 *p++ = (Py_UNICODE)value;
3430 }
3431 else if (x == Py_None) {
3432 /* undefined mapping */
3433 outpos = p-PyUnicode_AS_UNICODE(v);
3434 startinpos = s-starts;
3435 endinpos = startinpos+1;
3436 if (unicode_decode_call_errorhandler(
3437 errors, &errorHandler,
3438 "charmap", "character maps to <undefined>",
3439 starts, size, &startinpos, &endinpos, &exc, &s,
3440 (PyObject **)&v, &outpos, &p)) {
3441 Py_DECREF(x);
3442 goto onError;
3443 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003444 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003445 continue;
3446 }
3447 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003448 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003449
3450 if (targetsize == 1)
3451 /* 1-1 mapping */
3452 *p++ = *PyUnicode_AS_UNICODE(x);
3453
3454 else if (targetsize > 1) {
3455 /* 1-n mapping */
3456 if (targetsize > extrachars) {
3457 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003458 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3459 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003460 (targetsize << 2);
3461 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003462 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003463 if (_PyUnicode_Resize(&v,
3464 PyUnicode_GET_SIZE(v) + needed) < 0) {
3465 Py_DECREF(x);
3466 goto onError;
3467 }
3468 p = PyUnicode_AS_UNICODE(v) + oldpos;
3469 }
3470 Py_UNICODE_COPY(p,
3471 PyUnicode_AS_UNICODE(x),
3472 targetsize);
3473 p += targetsize;
3474 extrachars -= targetsize;
3475 }
3476 /* 1-0 mapping: skip the character */
3477 }
3478 else {
3479 /* wrong return value */
3480 PyErr_SetString(PyExc_TypeError,
3481 "character mapping must return integer, None or unicode");
3482 Py_DECREF(x);
3483 goto onError;
3484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003486 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 }
3489 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003490 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 Py_XDECREF(errorHandler);
3493 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003495
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 Py_XDECREF(errorHandler);
3498 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 Py_XDECREF(v);
3500 return NULL;
3501}
3502
Martin v. Löwis3f767792006-06-04 19:36:28 +00003503/* Charmap encoding: the lookup table */
3504
3505struct encoding_map{
3506 PyObject_HEAD
3507 unsigned char level1[32];
3508 int count2, count3;
3509 unsigned char level23[1];
3510};
3511
3512static PyObject*
3513encoding_map_size(PyObject *obj, PyObject* args)
3514{
3515 struct encoding_map *map = (struct encoding_map*)obj;
3516 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3517 128*map->count3);
3518}
3519
3520static PyMethodDef encoding_map_methods[] = {
3521 {"size", encoding_map_size, METH_NOARGS,
3522 PyDoc_STR("Return the size (in bytes) of this object") },
3523 { 0 }
3524};
3525
3526static void
3527encoding_map_dealloc(PyObject* o)
3528{
3529 PyObject_FREE(o);
3530}
3531
3532static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00003533 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003534 "EncodingMap", /*tp_name*/
3535 sizeof(struct encoding_map), /*tp_basicsize*/
3536 0, /*tp_itemsize*/
3537 /* methods */
3538 encoding_map_dealloc, /*tp_dealloc*/
3539 0, /*tp_print*/
3540 0, /*tp_getattr*/
3541 0, /*tp_setattr*/
3542 0, /*tp_compare*/
3543 0, /*tp_repr*/
3544 0, /*tp_as_number*/
3545 0, /*tp_as_sequence*/
3546 0, /*tp_as_mapping*/
3547 0, /*tp_hash*/
3548 0, /*tp_call*/
3549 0, /*tp_str*/
3550 0, /*tp_getattro*/
3551 0, /*tp_setattro*/
3552 0, /*tp_as_buffer*/
3553 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3554 0, /*tp_doc*/
3555 0, /*tp_traverse*/
3556 0, /*tp_clear*/
3557 0, /*tp_richcompare*/
3558 0, /*tp_weaklistoffset*/
3559 0, /*tp_iter*/
3560 0, /*tp_iternext*/
3561 encoding_map_methods, /*tp_methods*/
3562 0, /*tp_members*/
3563 0, /*tp_getset*/
3564 0, /*tp_base*/
3565 0, /*tp_dict*/
3566 0, /*tp_descr_get*/
3567 0, /*tp_descr_set*/
3568 0, /*tp_dictoffset*/
3569 0, /*tp_init*/
3570 0, /*tp_alloc*/
3571 0, /*tp_new*/
3572 0, /*tp_free*/
3573 0, /*tp_is_gc*/
3574};
3575
3576PyObject*
3577PyUnicode_BuildEncodingMap(PyObject* string)
3578{
3579 Py_UNICODE *decode;
3580 PyObject *result;
3581 struct encoding_map *mresult;
3582 int i;
3583 int need_dict = 0;
3584 unsigned char level1[32];
3585 unsigned char level2[512];
3586 unsigned char *mlevel1, *mlevel2, *mlevel3;
3587 int count2 = 0, count3 = 0;
3588
3589 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3590 PyErr_BadArgument();
3591 return NULL;
3592 }
3593 decode = PyUnicode_AS_UNICODE(string);
3594 memset(level1, 0xFF, sizeof level1);
3595 memset(level2, 0xFF, sizeof level2);
3596
3597 /* If there isn't a one-to-one mapping of NULL to \0,
3598 or if there are non-BMP characters, we need to use
3599 a mapping dictionary. */
3600 if (decode[0] != 0)
3601 need_dict = 1;
3602 for (i = 1; i < 256; i++) {
3603 int l1, l2;
3604 if (decode[i] == 0
3605 #ifdef Py_UNICODE_WIDE
3606 || decode[i] > 0xFFFF
3607 #endif
3608 ) {
3609 need_dict = 1;
3610 break;
3611 }
3612 if (decode[i] == 0xFFFE)
3613 /* unmapped character */
3614 continue;
3615 l1 = decode[i] >> 11;
3616 l2 = decode[i] >> 7;
3617 if (level1[l1] == 0xFF)
3618 level1[l1] = count2++;
3619 if (level2[l2] == 0xFF)
3620 level2[l2] = count3++;
3621 }
3622
3623 if (count2 >= 0xFF || count3 >= 0xFF)
3624 need_dict = 1;
3625
3626 if (need_dict) {
3627 PyObject *result = PyDict_New();
3628 PyObject *key, *value;
3629 if (!result)
3630 return NULL;
3631 for (i = 0; i < 256; i++) {
3632 key = value = NULL;
3633 key = PyInt_FromLong(decode[i]);
3634 value = PyInt_FromLong(i);
3635 if (!key || !value)
3636 goto failed1;
3637 if (PyDict_SetItem(result, key, value) == -1)
3638 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003639 Py_DECREF(key);
3640 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003641 }
3642 return result;
3643 failed1:
3644 Py_XDECREF(key);
3645 Py_XDECREF(value);
3646 Py_DECREF(result);
3647 return NULL;
3648 }
3649
3650 /* Create a three-level trie */
3651 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3652 16*count2 + 128*count3 - 1);
3653 if (!result)
3654 return PyErr_NoMemory();
3655 PyObject_Init(result, &EncodingMapType);
3656 mresult = (struct encoding_map*)result;
3657 mresult->count2 = count2;
3658 mresult->count3 = count3;
3659 mlevel1 = mresult->level1;
3660 mlevel2 = mresult->level23;
3661 mlevel3 = mresult->level23 + 16*count2;
3662 memcpy(mlevel1, level1, 32);
3663 memset(mlevel2, 0xFF, 16*count2);
3664 memset(mlevel3, 0, 128*count3);
3665 count3 = 0;
3666 for (i = 1; i < 256; i++) {
3667 int o1, o2, o3, i2, i3;
3668 if (decode[i] == 0xFFFE)
3669 /* unmapped character */
3670 continue;
3671 o1 = decode[i]>>11;
3672 o2 = (decode[i]>>7) & 0xF;
3673 i2 = 16*mlevel1[o1] + o2;
3674 if (mlevel2[i2] == 0xFF)
3675 mlevel2[i2] = count3++;
3676 o3 = decode[i] & 0x7F;
3677 i3 = 128*mlevel2[i2] + o3;
3678 mlevel3[i3] = i;
3679 }
3680 return result;
3681}
3682
3683static int
3684encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3685{
3686 struct encoding_map *map = (struct encoding_map*)mapping;
3687 int l1 = c>>11;
3688 int l2 = (c>>7) & 0xF;
3689 int l3 = c & 0x7F;
3690 int i;
3691
3692#ifdef Py_UNICODE_WIDE
3693 if (c > 0xFFFF) {
3694 return -1;
3695 }
3696#endif
3697 if (c == 0)
3698 return 0;
3699 /* level 1*/
3700 i = map->level1[l1];
3701 if (i == 0xFF) {
3702 return -1;
3703 }
3704 /* level 2*/
3705 i = map->level23[16*i+l2];
3706 if (i == 0xFF) {
3707 return -1;
3708 }
3709 /* level 3 */
3710 i = map->level23[16*map->count2 + 128*i + l3];
3711 if (i == 0) {
3712 return -1;
3713 }
3714 return i;
3715}
3716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717/* Lookup the character ch in the mapping. If the character
3718 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003719 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 PyObject *w = PyInt_FromLong((long)c);
3723 PyObject *x;
3724
3725 if (w == NULL)
3726 return NULL;
3727 x = PyObject_GetItem(mapping, w);
3728 Py_DECREF(w);
3729 if (x == NULL) {
3730 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3731 /* No mapping found means: mapping is undefined. */
3732 PyErr_Clear();
3733 x = Py_None;
3734 Py_INCREF(x);
3735 return x;
3736 } else
3737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003739 else if (x == Py_None)
3740 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 else if (PyInt_Check(x)) {
3742 long value = PyInt_AS_LONG(x);
3743 if (value < 0 || value > 255) {
3744 PyErr_SetString(PyExc_TypeError,
3745 "character mapping must be in range(256)");
3746 Py_DECREF(x);
3747 return NULL;
3748 }
3749 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751 else if (PyString_Check(x))
3752 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 /* wrong return value */
3755 PyErr_SetString(PyExc_TypeError,
3756 "character mapping must return integer, None or str");
3757 Py_DECREF(x);
3758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 }
3760}
3761
Martin v. Löwis3f767792006-06-04 19:36:28 +00003762static int
3763charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3764{
3765 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3766 /* exponentially overallocate to minimize reallocations */
3767 if (requiredsize < 2*outsize)
3768 requiredsize = 2*outsize;
3769 if (_PyString_Resize(outobj, requiredsize)) {
3770 return 0;
3771 }
3772 return 1;
3773}
3774
/* Outcome of encoding a single character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was written to the output */
    enc_FAILED,     /* character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778/* lookup the character, put the result in the output string and adjust
3779 various state variables. Reallocate the output string if not enough
3780 space is available. Return a new reference to the object that
3781 was put in the output buffer, or Py_None, if the mapping was undefined
3782 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003783 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003785charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003786 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003788 PyObject *rep;
3789 char *outstart;
3790 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791
Martin v. Löwis68192102007-07-21 06:55:02 +00003792 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003793 int res = encoding_map_lookup(c, mapping);
3794 Py_ssize_t requiredsize = *outpos+1;
3795 if (res == -1)
3796 return enc_FAILED;
3797 if (outsize<requiredsize)
3798 if (!charmapencode_resize(outobj, outpos, requiredsize))
3799 return enc_EXCEPTION;
3800 outstart = PyString_AS_STRING(*outobj);
3801 outstart[(*outpos)++] = (char)res;
3802 return enc_SUCCESS;
3803 }
3804
3805 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003807 return enc_EXCEPTION;
3808 else if (rep==Py_None) {
3809 Py_DECREF(rep);
3810 return enc_FAILED;
3811 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003813 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003814 if (outsize<requiredsize)
3815 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003817 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003819 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3821 }
3822 else {
3823 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003824 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3825 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003826 if (outsize<requiredsize)
3827 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003829 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003831 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 memcpy(outstart + *outpos, repchars, repsize);
3833 *outpos += repsize;
3834 }
3835 }
Georg Brandl9f167602006-06-04 21:46:16 +00003836 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003837 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838}
3839
3840/* handle an error in PyUnicode_EncodeCharmap
3841 Return 0 on success, -1 on error */
3842static
3843int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003844 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003846 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003847 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848{
3849 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003850 Py_ssize_t repsize;
3851 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 Py_UNICODE *uni2;
3853 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854 Py_ssize_t collstartpos = *inpos;
3855 Py_ssize_t collendpos = *inpos+1;
3856 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 char *encoding = "charmap";
3858 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003859 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 /* find all unencodable characters */
3862 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003863 PyObject *rep;
Martin v. Löwis68192102007-07-21 06:55:02 +00003864 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003865 int res = encoding_map_lookup(p[collendpos], mapping);
3866 if (res != -1)
3867 break;
3868 ++collendpos;
3869 continue;
3870 }
3871
3872 rep = charmapencode_lookup(p[collendpos], mapping);
3873 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003874 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003875 else if (rep!=Py_None) {
3876 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003877 break;
3878 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003879 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003880 ++collendpos;
3881 }
3882 /* cache callback name lookup
3883 * (if not done yet, i.e. it's the first error) */
3884 if (*known_errorHandler==-1) {
3885 if ((errors==NULL) || (!strcmp(errors, "strict")))
3886 *known_errorHandler = 1;
3887 else if (!strcmp(errors, "replace"))
3888 *known_errorHandler = 2;
3889 else if (!strcmp(errors, "ignore"))
3890 *known_errorHandler = 3;
3891 else if (!strcmp(errors, "xmlcharrefreplace"))
3892 *known_errorHandler = 4;
3893 else
3894 *known_errorHandler = 0;
3895 }
3896 switch (*known_errorHandler) {
3897 case 1: /* strict */
3898 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3899 return -1;
3900 case 2: /* replace */
3901 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3902 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003903 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 return -1;
3905 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003906 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3908 return -1;
3909 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 }
3911 /* fall through */
3912 case 3: /* ignore */
3913 *inpos = collendpos;
3914 break;
3915 case 4: /* xmlcharrefreplace */
3916 /* generate replacement (temporarily (mis)uses p) */
3917 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3918 char buffer[2+29+1+1];
3919 char *cp;
3920 sprintf(buffer, "&#%d;", (int)p[collpos]);
3921 for (cp = buffer; *cp; ++cp) {
3922 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003923 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003925 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3927 return -1;
3928 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 }
3930 }
3931 *inpos = collendpos;
3932 break;
3933 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003934 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 encoding, reason, p, size, exceptionObject,
3936 collstartpos, collendpos, &newpos);
3937 if (repunicode == NULL)
3938 return -1;
3939 /* generate replacement */
3940 repsize = PyUnicode_GET_SIZE(repunicode);
3941 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3942 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003943 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 return -1;
3945 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003946 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3949 return -1;
3950 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 }
3952 *inpos = newpos;
3953 Py_DECREF(repunicode);
3954 }
3955 return 0;
3956}
3957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 PyObject *mapping,
3961 const char *errors)
3962{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 /* output object */
3964 PyObject *res = NULL;
3965 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003966 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003968 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 PyObject *errorHandler = NULL;
3970 PyObject *exc = NULL;
3971 /* the following variable is used for caching string comparisons
3972 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3973 * 3=ignore, 4=xmlcharrefreplace */
3974 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975
3976 /* Default to Latin-1 */
3977 if (mapping == NULL)
3978 return PyUnicode_EncodeLatin1(p, size, errors);
3979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 /* allocate enough for a simple encoding without
3981 replacements, if we need more, we'll resize */
3982 res = PyString_FromStringAndSize(NULL, size);
3983 if (res == NULL)
3984 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003985 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 while (inpos<size) {
3989 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003990 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3991 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003993 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 if (charmap_encoding_error(p, size, &inpos, mapping,
3995 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003996 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003997 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003998 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 else
4002 /* done with this character => adjust input position */
4003 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006 /* Resize if we allocated to much */
4007 if (respos<PyString_GET_SIZE(res)) {
4008 if (_PyString_Resize(&res, respos))
4009 goto onError;
4010 }
4011 Py_XDECREF(exc);
4012 Py_XDECREF(errorHandler);
4013 return res;
4014
4015 onError:
4016 Py_XDECREF(res);
4017 Py_XDECREF(exc);
4018 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 return NULL;
4020}
4021
4022PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4023 PyObject *mapping)
4024{
4025 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4026 PyErr_BadArgument();
4027 return NULL;
4028 }
4029 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4030 PyUnicode_GET_SIZE(unicode),
4031 mapping,
4032 NULL);
4033}
4034
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004035/* create or adjust a UnicodeTranslateError */
4036static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004037 const Py_UNICODE *unicode, Py_ssize_t size,
4038 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 if (*exceptionObject == NULL) {
4042 *exceptionObject = PyUnicodeTranslateError_Create(
4043 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 }
4045 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4047 goto onError;
4048 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4049 goto onError;
4050 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4051 goto onError;
4052 return;
4053 onError:
4054 Py_DECREF(*exceptionObject);
4055 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 }
4057}
4058
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059/* raises a UnicodeTranslateError */
4060static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004061 const Py_UNICODE *unicode, Py_ssize_t size,
4062 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 const char *reason)
4064{
4065 make_translate_exception(exceptionObject,
4066 unicode, size, startpos, endpos, reason);
4067 if (*exceptionObject != NULL)
4068 PyCodec_StrictErrors(*exceptionObject);
4069}
4070
4071/* error handling callback helper:
4072 build arguments, call the callback and check the arguments,
4073 put the result into newpos and return the replacement string, which
4074 has to be freed by the caller */
4075static PyObject *unicode_translate_call_errorhandler(const char *errors,
4076 PyObject **errorHandler,
4077 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004078 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4079 Py_ssize_t startpos, Py_ssize_t endpos,
4080 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004082 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083
Martin v. Löwis412fb672006-04-13 06:34:32 +00004084 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 PyObject *restuple;
4086 PyObject *resunicode;
4087
4088 if (*errorHandler == NULL) {
4089 *errorHandler = PyCodec_LookupError(errors);
4090 if (*errorHandler == NULL)
4091 return NULL;
4092 }
4093
4094 make_translate_exception(exceptionObject,
4095 unicode, size, startpos, endpos, reason);
4096 if (*exceptionObject == NULL)
4097 return NULL;
4098
4099 restuple = PyObject_CallFunctionObjArgs(
4100 *errorHandler, *exceptionObject, NULL);
4101 if (restuple == NULL)
4102 return NULL;
4103 if (!PyTuple_Check(restuple)) {
4104 PyErr_Format(PyExc_TypeError, &argparse[4]);
4105 Py_DECREF(restuple);
4106 return NULL;
4107 }
4108 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004109 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 Py_DECREF(restuple);
4111 return NULL;
4112 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004113 if (i_newpos<0)
4114 *newpos = size+i_newpos;
4115 else
4116 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004117 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004118 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004119 Py_DECREF(restuple);
4120 return NULL;
4121 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 Py_INCREF(resunicode);
4123 Py_DECREF(restuple);
4124 return resunicode;
4125}
4126
4127/* Lookup the character ch in the mapping and put the result in result,
4128 which must be decrefed by the caller.
4129 Return 0 on success, -1 on error */
4130static
4131int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4132{
4133 PyObject *w = PyInt_FromLong((long)c);
4134 PyObject *x;
4135
4136 if (w == NULL)
4137 return -1;
4138 x = PyObject_GetItem(mapping, w);
4139 Py_DECREF(w);
4140 if (x == NULL) {
4141 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4142 /* No mapping found means: use 1:1 mapping. */
4143 PyErr_Clear();
4144 *result = NULL;
4145 return 0;
4146 } else
4147 return -1;
4148 }
4149 else if (x == Py_None) {
4150 *result = x;
4151 return 0;
4152 }
4153 else if (PyInt_Check(x)) {
4154 long value = PyInt_AS_LONG(x);
4155 long max = PyUnicode_GetMax();
4156 if (value < 0 || value > max) {
4157 PyErr_Format(PyExc_TypeError,
4158 "character mapping must be in range(0x%lx)", max+1);
4159 Py_DECREF(x);
4160 return -1;
4161 }
4162 *result = x;
4163 return 0;
4164 }
4165 else if (PyUnicode_Check(x)) {
4166 *result = x;
4167 return 0;
4168 }
4169 else {
4170 /* wrong return value */
4171 PyErr_SetString(PyExc_TypeError,
4172 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004173 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 return -1;
4175 }
4176}
4177/* ensure that *outobj is at least requiredsize characters long,
4178if not reallocate and adjust various state variables.
4179Return 0 on success, -1 on error */
4180static
Walter Dörwald4894c302003-10-24 14:25:28 +00004181int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004182 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004184 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004185 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004187 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004189 if (requiredsize < 2 * oldsize)
4190 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004191 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 return -1;
4193 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 }
4195 return 0;
4196}
4197/* lookup the character, put the result in the output string and adjust
4198 various state variables. Return a new reference to the object that
4199 was put in the output buffer in *result, or Py_None, if the mapping was
4200 undefined (in which case no character was written).
4201 The called must decref result.
4202 Return 0 on success, -1 on error. */
4203static
Walter Dörwald4894c302003-10-24 14:25:28 +00004204int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004205 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004206 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207{
Walter Dörwald4894c302003-10-24 14:25:28 +00004208 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 return -1;
4210 if (*res==NULL) {
4211 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004212 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 }
4214 else if (*res==Py_None)
4215 ;
4216 else if (PyInt_Check(*res)) {
4217 /* no overflow check, because we know that the space is enough */
4218 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4219 }
4220 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004221 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 if (repsize==1) {
4223 /* no overflow check, because we know that the space is enough */
4224 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4225 }
4226 else if (repsize!=0) {
4227 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004228 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004229 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004230 repsize - 1;
4231 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 return -1;
4233 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4234 *outp += repsize;
4235 }
4236 }
4237 else
4238 return -1;
4239 return 0;
4240}
4241
4242PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004243 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244 PyObject *mapping,
4245 const char *errors)
4246{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 /* output object */
4248 PyObject *res = NULL;
4249 /* pointers to the beginning and end+1 of input */
4250 const Py_UNICODE *startp = p;
4251 const Py_UNICODE *endp = p + size;
4252 /* pointer into the output */
4253 Py_UNICODE *str;
4254 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004255 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 char *reason = "character maps to <undefined>";
4257 PyObject *errorHandler = NULL;
4258 PyObject *exc = NULL;
4259 /* the following variable is used for caching string comparisons
4260 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4261 * 3=ignore, 4=xmlcharrefreplace */
4262 int known_errorHandler = -1;
4263
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 if (mapping == NULL) {
4265 PyErr_BadArgument();
4266 return NULL;
4267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268
4269 /* allocate enough for a simple 1:1 translation without
4270 replacements, if we need more, we'll resize */
4271 res = PyUnicode_FromUnicode(NULL, size);
4272 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004273 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 return res;
4276 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 while (p<endp) {
4279 /* try to encode it */
4280 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004281 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 goto onError;
4284 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004285 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 if (x!=Py_None) /* it worked => adjust input pointer */
4287 ++p;
4288 else { /* untranslatable character */
4289 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004290 Py_ssize_t repsize;
4291 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292 Py_UNICODE *uni2;
4293 /* startpos for collecting untranslatable chars */
4294 const Py_UNICODE *collstart = p;
4295 const Py_UNICODE *collend = p+1;
4296 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 /* find all untranslatable characters */
4299 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004300 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 goto onError;
4302 Py_XDECREF(x);
4303 if (x!=Py_None)
4304 break;
4305 ++collend;
4306 }
4307 /* cache callback name lookup
4308 * (if not done yet, i.e. it's the first error) */
4309 if (known_errorHandler==-1) {
4310 if ((errors==NULL) || (!strcmp(errors, "strict")))
4311 known_errorHandler = 1;
4312 else if (!strcmp(errors, "replace"))
4313 known_errorHandler = 2;
4314 else if (!strcmp(errors, "ignore"))
4315 known_errorHandler = 3;
4316 else if (!strcmp(errors, "xmlcharrefreplace"))
4317 known_errorHandler = 4;
4318 else
4319 known_errorHandler = 0;
4320 }
4321 switch (known_errorHandler) {
4322 case 1: /* strict */
4323 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4324 goto onError;
4325 case 2: /* replace */
4326 /* No need to check for space, this is a 1:1 replacement */
4327 for (coll = collstart; coll<collend; ++coll)
4328 *str++ = '?';
4329 /* fall through */
4330 case 3: /* ignore */
4331 p = collend;
4332 break;
4333 case 4: /* xmlcharrefreplace */
4334 /* generate replacement (temporarily (mis)uses p) */
4335 for (p = collstart; p < collend; ++p) {
4336 char buffer[2+29+1+1];
4337 char *cp;
4338 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004339 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4341 goto onError;
4342 for (cp = buffer; *cp; ++cp)
4343 *str++ = *cp;
4344 }
4345 p = collend;
4346 break;
4347 default:
4348 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4349 reason, startp, size, &exc,
4350 collstart-startp, collend-startp, &newpos);
4351 if (repunicode == NULL)
4352 goto onError;
4353 /* generate replacement */
4354 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004355 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4357 Py_DECREF(repunicode);
4358 goto onError;
4359 }
4360 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4361 *str++ = *uni2;
4362 p = startp + newpos;
4363 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 }
4365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 /* Resize if we allocated to much */
4368 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004369 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004370 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004371 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 }
4373 Py_XDECREF(exc);
4374 Py_XDECREF(errorHandler);
4375 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377 onError:
4378 Py_XDECREF(res);
4379 Py_XDECREF(exc);
4380 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 return NULL;
4382}
4383
4384PyObject *PyUnicode_Translate(PyObject *str,
4385 PyObject *mapping,
4386 const char *errors)
4387{
4388 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004389
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 str = PyUnicode_FromObject(str);
4391 if (str == NULL)
4392 goto onError;
4393 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4394 PyUnicode_GET_SIZE(str),
4395 mapping,
4396 errors);
4397 Py_DECREF(str);
4398 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004399
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 onError:
4401 Py_XDECREF(str);
4402 return NULL;
4403}
Tim Petersced69f82003-09-16 20:30:58 +00004404
Guido van Rossum9e896b32000-04-05 20:11:21 +00004405/* --- Decimal Encoder ---------------------------------------------------- */
4406
4407int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004408 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004409 char *output,
4410 const char *errors)
4411{
4412 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 PyObject *errorHandler = NULL;
4414 PyObject *exc = NULL;
4415 const char *encoding = "decimal";
4416 const char *reason = "invalid decimal Unicode string";
4417 /* the following variable is used for caching string comparisons
4418 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4419 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004420
4421 if (output == NULL) {
4422 PyErr_BadArgument();
4423 return -1;
4424 }
4425
4426 p = s;
4427 end = s + length;
4428 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004430 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t repsize;
4433 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 Py_UNICODE *uni2;
4435 Py_UNICODE *collstart;
4436 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004437
Guido van Rossum9e896b32000-04-05 20:11:21 +00004438 if (Py_UNICODE_ISSPACE(ch)) {
4439 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004441 continue;
4442 }
4443 decimal = Py_UNICODE_TODECIMAL(ch);
4444 if (decimal >= 0) {
4445 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004447 continue;
4448 }
Guido van Rossumba477042000-04-06 18:18:10 +00004449 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004450 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004452 continue;
4453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 /* All other characters are considered unencodable */
4455 collstart = p;
4456 collend = p+1;
4457 while (collend < end) {
4458 if ((0 < *collend && *collend < 256) ||
4459 !Py_UNICODE_ISSPACE(*collend) ||
4460 Py_UNICODE_TODECIMAL(*collend))
4461 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 /* cache callback name lookup
4464 * (if not done yet, i.e. it's the first error) */
4465 if (known_errorHandler==-1) {
4466 if ((errors==NULL) || (!strcmp(errors, "strict")))
4467 known_errorHandler = 1;
4468 else if (!strcmp(errors, "replace"))
4469 known_errorHandler = 2;
4470 else if (!strcmp(errors, "ignore"))
4471 known_errorHandler = 3;
4472 else if (!strcmp(errors, "xmlcharrefreplace"))
4473 known_errorHandler = 4;
4474 else
4475 known_errorHandler = 0;
4476 }
4477 switch (known_errorHandler) {
4478 case 1: /* strict */
4479 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4480 goto onError;
4481 case 2: /* replace */
4482 for (p = collstart; p < collend; ++p)
4483 *output++ = '?';
4484 /* fall through */
4485 case 3: /* ignore */
4486 p = collend;
4487 break;
4488 case 4: /* xmlcharrefreplace */
4489 /* generate replacement (temporarily (mis)uses p) */
4490 for (p = collstart; p < collend; ++p)
4491 output += sprintf(output, "&#%d;", (int)*p);
4492 p = collend;
4493 break;
4494 default:
4495 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4496 encoding, reason, s, length, &exc,
4497 collstart-s, collend-s, &newpos);
4498 if (repunicode == NULL)
4499 goto onError;
4500 /* generate replacement */
4501 repsize = PyUnicode_GET_SIZE(repunicode);
4502 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4503 Py_UNICODE ch = *uni2;
4504 if (Py_UNICODE_ISSPACE(ch))
4505 *output++ = ' ';
4506 else {
4507 decimal = Py_UNICODE_TODECIMAL(ch);
4508 if (decimal >= 0)
4509 *output++ = '0' + decimal;
4510 else if (0 < ch && ch < 256)
4511 *output++ = (char)ch;
4512 else {
4513 Py_DECREF(repunicode);
4514 raise_encode_exception(&exc, encoding,
4515 s, length, collstart-s, collend-s, reason);
4516 goto onError;
4517 }
4518 }
4519 }
4520 p = s + newpos;
4521 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004522 }
4523 }
4524 /* 0-terminate the output string */
4525 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 Py_XDECREF(exc);
4527 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004528 return 0;
4529
4530 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 Py_XDECREF(exc);
4532 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004533 return -1;
4534}
4535
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536/* --- Helpers ------------------------------------------------------------ */
4537
Fredrik Lundha50d2012006-05-26 17:04:58 +00004538#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004539
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004540#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004541#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004542#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004543
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004544Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004545STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4546{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004547 if (str[0] != other[0])
4548 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004549 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4550}
4551
Fredrik Lundhb9479482006-05-26 17:22:38 +00004552#define STRINGLIB_EMPTY unicode_empty
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00004553#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004554
Fredrik Lundha50d2012006-05-26 17:04:58 +00004555#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004556
4557#include "stringlib/count.h"
4558#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004559#include "stringlib/partition.h"
4560
/* Helper macro to fix up start/end slice values against (obj)->length.

   Operates on the variables `start` and `end` of the enclosing scope:
   a negative `start` is made relative to the end and then clamped to 0;
   `end` is clamped to the length, and a negative `end` is made relative
   to the end and then clamped to 0.

   Wrapped in do { } while (0) so that it expands to a single statement;
   use it as `FIX_START_END(obj);`. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4573
Martin v. Löwis18e16552006-02-15 17:27:45 +00004574Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004575 PyObject *substr,
4576 Py_ssize_t start,
4577 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004579 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004580 PyUnicodeObject* str_obj;
4581 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004582
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004583 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4584 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004586 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4587 if (!sub_obj) {
4588 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589 return -1;
4590 }
Tim Petersced69f82003-09-16 20:30:58 +00004591
Fredrik Lundhc8162812006-05-26 19:33:03 +00004592 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004593
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004594 result = stringlib_count(
4595 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4596 );
4597
4598 Py_DECREF(sub_obj);
4599 Py_DECREF(str_obj);
4600
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 return result;
4602}
4603
Martin v. Löwis18e16552006-02-15 17:27:45 +00004604Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004605 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004606 Py_ssize_t start,
4607 Py_ssize_t end,
4608 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004610 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004611
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004612 str = PyUnicode_FromObject(str);
4613 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004614 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004615 sub = PyUnicode_FromObject(sub);
4616 if (!sub) {
4617 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004618 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619 }
Tim Petersced69f82003-09-16 20:30:58 +00004620
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004621 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004622 result = stringlib_find_slice(
4623 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4624 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4625 start, end
4626 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004627 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004628 result = stringlib_rfind_slice(
4629 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4630 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4631 start, end
4632 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004633
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004634 Py_DECREF(str);
4635 Py_DECREF(sub);
4636
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 return result;
4638}
4639
Tim Petersced69f82003-09-16 20:30:58 +00004640static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641int tailmatch(PyUnicodeObject *self,
4642 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004643 Py_ssize_t start,
4644 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 int direction)
4646{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 if (substring->length == 0)
4648 return 1;
4649
Fredrik Lundhc8162812006-05-26 19:33:03 +00004650 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651
4652 end -= substring->length;
4653 if (end < start)
4654 return 0;
4655
4656 if (direction > 0) {
4657 if (Py_UNICODE_MATCH(self, end, substring))
4658 return 1;
4659 } else {
4660 if (Py_UNICODE_MATCH(self, start, substring))
4661 return 1;
4662 }
4663
4664 return 0;
4665}
4666
Martin v. Löwis18e16552006-02-15 17:27:45 +00004667Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004669 Py_ssize_t start,
4670 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 int direction)
4672{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004673 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004674
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 str = PyUnicode_FromObject(str);
4676 if (str == NULL)
4677 return -1;
4678 substr = PyUnicode_FromObject(substr);
4679 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004680 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 return -1;
4682 }
Tim Petersced69f82003-09-16 20:30:58 +00004683
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 result = tailmatch((PyUnicodeObject *)str,
4685 (PyUnicodeObject *)substr,
4686 start, end, direction);
4687 Py_DECREF(str);
4688 Py_DECREF(substr);
4689 return result;
4690}
4691
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692/* Apply fixfct filter to the Unicode object self and return a
4693 reference to the modified object */
4694
Tim Petersced69f82003-09-16 20:30:58 +00004695static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696PyObject *fixup(PyUnicodeObject *self,
4697 int (*fixfct)(PyUnicodeObject *s))
4698{
4699
4700 PyUnicodeObject *u;
4701
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004702 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 if (u == NULL)
4704 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004705
4706 Py_UNICODE_COPY(u->str, self->str, self->length);
4707
Tim Peters7a29bd52001-09-12 03:03:31 +00004708 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 /* fixfct should return TRUE if it modified the buffer. If
4710 FALSE, return a reference to the original buffer instead
4711 (to save space, not time) */
4712 Py_INCREF(self);
4713 Py_DECREF(u);
4714 return (PyObject*) self;
4715 }
4716 return (PyObject*) u;
4717}
4718
Tim Petersced69f82003-09-16 20:30:58 +00004719static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720int fixupper(PyUnicodeObject *self)
4721{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004722 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723 Py_UNICODE *s = self->str;
4724 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004725
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 while (len-- > 0) {
4727 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 ch = Py_UNICODE_TOUPPER(*s);
4730 if (ch != *s) {
4731 status = 1;
4732 *s = ch;
4733 }
4734 s++;
4735 }
4736
4737 return status;
4738}
4739
Tim Petersced69f82003-09-16 20:30:58 +00004740static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741int fixlower(PyUnicodeObject *self)
4742{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004743 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 Py_UNICODE *s = self->str;
4745 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004746
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 while (len-- > 0) {
4748 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 ch = Py_UNICODE_TOLOWER(*s);
4751 if (ch != *s) {
4752 status = 1;
4753 *s = ch;
4754 }
4755 s++;
4756 }
4757
4758 return status;
4759}
4760
Tim Petersced69f82003-09-16 20:30:58 +00004761static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762int fixswapcase(PyUnicodeObject *self)
4763{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004764 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 Py_UNICODE *s = self->str;
4766 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004767
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 while (len-- > 0) {
4769 if (Py_UNICODE_ISUPPER(*s)) {
4770 *s = Py_UNICODE_TOLOWER(*s);
4771 status = 1;
4772 } else if (Py_UNICODE_ISLOWER(*s)) {
4773 *s = Py_UNICODE_TOUPPER(*s);
4774 status = 1;
4775 }
4776 s++;
4777 }
4778
4779 return status;
4780}
4781
Tim Petersced69f82003-09-16 20:30:58 +00004782static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783int fixcapitalize(PyUnicodeObject *self)
4784{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004785 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004786 Py_UNICODE *s = self->str;
4787 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004788
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004789 if (len == 0)
4790 return 0;
4791 if (Py_UNICODE_ISLOWER(*s)) {
4792 *s = Py_UNICODE_TOUPPER(*s);
4793 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004795 s++;
4796 while (--len > 0) {
4797 if (Py_UNICODE_ISUPPER(*s)) {
4798 *s = Py_UNICODE_TOLOWER(*s);
4799 status = 1;
4800 }
4801 s++;
4802 }
4803 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804}
4805
4806static
4807int fixtitle(PyUnicodeObject *self)
4808{
4809 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4810 register Py_UNICODE *e;
4811 int previous_is_cased;
4812
4813 /* Shortcut for single character strings */
4814 if (PyUnicode_GET_SIZE(self) == 1) {
4815 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4816 if (*p != ch) {
4817 *p = ch;
4818 return 1;
4819 }
4820 else
4821 return 0;
4822 }
Tim Petersced69f82003-09-16 20:30:58 +00004823
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 e = p + PyUnicode_GET_SIZE(self);
4825 previous_is_cased = 0;
4826 for (; p < e; p++) {
4827 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 if (previous_is_cased)
4830 *p = Py_UNICODE_TOLOWER(ch);
4831 else
4832 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004833
4834 if (Py_UNICODE_ISLOWER(ch) ||
4835 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 Py_UNICODE_ISTITLE(ch))
4837 previous_is_cased = 1;
4838 else
4839 previous_is_cased = 0;
4840 }
4841 return 1;
4842}
4843
Tim Peters8ce9f162004-08-27 01:49:32 +00004844PyObject *
4845PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846{
Tim Peters8ce9f162004-08-27 01:49:32 +00004847 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004848 const Py_UNICODE blank = ' ';
4849 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004850 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004851 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004852 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4853 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004854 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4855 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004856 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004857 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004858 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859
Tim Peters05eba1f2004-08-27 21:32:02 +00004860 fseq = PySequence_Fast(seq, "");
4861 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004862 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004863 }
4864
Tim Peters91879ab2004-08-27 22:35:44 +00004865 /* Grrrr. A codec may be invoked to convert str objects to
4866 * Unicode, and so it's possible to call back into Python code
4867 * during PyUnicode_FromObject(), and so it's possible for a sick
4868 * codec to change the size of fseq (if seq is a list). Therefore
4869 * we have to keep refetching the size -- can't assume seqlen
4870 * is invariant.
4871 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004872 seqlen = PySequence_Fast_GET_SIZE(fseq);
4873 /* If empty sequence, return u"". */
4874 if (seqlen == 0) {
4875 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4876 goto Done;
4877 }
4878 /* If singleton sequence with an exact Unicode, return that. */
4879 if (seqlen == 1) {
4880 item = PySequence_Fast_GET_ITEM(fseq, 0);
4881 if (PyUnicode_CheckExact(item)) {
4882 Py_INCREF(item);
4883 res = (PyUnicodeObject *)item;
4884 goto Done;
4885 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004886 }
4887
Tim Peters05eba1f2004-08-27 21:32:02 +00004888 /* At least two items to join, or one that isn't exact Unicode. */
4889 if (seqlen > 1) {
4890 /* Set up sep and seplen -- they're needed. */
4891 if (separator == NULL) {
4892 sep = &blank;
4893 seplen = 1;
4894 }
4895 else {
4896 internal_separator = PyUnicode_FromObject(separator);
4897 if (internal_separator == NULL)
4898 goto onError;
4899 sep = PyUnicode_AS_UNICODE(internal_separator);
4900 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004901 /* In case PyUnicode_FromObject() mutated seq. */
4902 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004903 }
4904 }
4905
4906 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004907 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004908 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004909 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004910 res_p = PyUnicode_AS_UNICODE(res);
4911 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004912
Tim Peters05eba1f2004-08-27 21:32:02 +00004913 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004914 Py_ssize_t itemlen;
4915 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004916
4917 item = PySequence_Fast_GET_ITEM(fseq, i);
4918 /* Convert item to Unicode. */
4919 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4920 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004921 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004922 " %.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +00004923 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004924 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004925 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004926 item = PyUnicode_FromObject(item);
4927 if (item == NULL)
4928 goto onError;
4929 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004930
Tim Peters91879ab2004-08-27 22:35:44 +00004931 /* In case PyUnicode_FromObject() mutated seq. */
4932 seqlen = PySequence_Fast_GET_SIZE(fseq);
4933
Tim Peters8ce9f162004-08-27 01:49:32 +00004934 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004936 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004937 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004938 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004939 if (i < seqlen - 1) {
4940 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004941 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004942 goto Overflow;
4943 }
4944 if (new_res_used > res_alloc) {
4945 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004946 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004947 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004948 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004949 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004950 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004951 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004952 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004954 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004955 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004957
4958 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004959 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004960 res_p += itemlen;
4961 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004962 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004963 res_p += seplen;
4964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004966 res_used = new_res_used;
4967 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004968
Tim Peters05eba1f2004-08-27 21:32:02 +00004969 /* Shrink res to match the used area; this probably can't fail,
4970 * but it's cheap to check.
4971 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004972 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004973 goto onError;
4974
4975 Done:
4976 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004977 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 return (PyObject *)res;
4979
Tim Peters8ce9f162004-08-27 01:49:32 +00004980 Overflow:
4981 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004982 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004983 Py_DECREF(item);
4984 /* fall through */
4985
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004987 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004988 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004989 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 return NULL;
4991}
4992
Tim Petersced69f82003-09-16 20:30:58 +00004993static
4994PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004995 Py_ssize_t left,
4996 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 Py_UNICODE fill)
4998{
4999 PyUnicodeObject *u;
5000
5001 if (left < 0)
5002 left = 0;
5003 if (right < 0)
5004 right = 0;
5005
Tim Peters7a29bd52001-09-12 03:03:31 +00005006 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007 Py_INCREF(self);
5008 return self;
5009 }
5010
5011 u = _PyUnicode_New(left + self->length + right);
5012 if (u) {
5013 if (left)
5014 Py_UNICODE_FILL(u->str, fill, left);
5015 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5016 if (right)
5017 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5018 }
5019
5020 return u;
5021}
5022
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` object and an `onError` label; jumps to onError if
   either the slice creation or the append fails.  Wrapped in
   do { } while (0) so that the macro behaves as a single statement and
   must be followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5033
5034static
5035PyObject *split_whitespace(PyUnicodeObject *self,
5036 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005037 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005039 register Py_ssize_t i;
5040 register Py_ssize_t j;
5041 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 PyObject *str;
5043
5044 for (i = j = 0; i < len; ) {
5045 /* find a token */
5046 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5047 i++;
5048 j = i;
5049 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5050 i++;
5051 if (j < i) {
5052 if (maxcount-- <= 0)
5053 break;
5054 SPLIT_APPEND(self->str, j, i);
5055 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5056 i++;
5057 j = i;
5058 }
5059 }
5060 if (j < len) {
5061 SPLIT_APPEND(self->str, j, len);
5062 }
5063 return list;
5064
5065 onError:
5066 Py_DECREF(list);
5067 return NULL;
5068}
5069
5070PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005071 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005073 register Py_ssize_t i;
5074 register Py_ssize_t j;
5075 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 PyObject *list;
5077 PyObject *str;
5078 Py_UNICODE *data;
5079
5080 string = PyUnicode_FromObject(string);
5081 if (string == NULL)
5082 return NULL;
5083 data = PyUnicode_AS_UNICODE(string);
5084 len = PyUnicode_GET_SIZE(string);
5085
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 list = PyList_New(0);
5087 if (!list)
5088 goto onError;
5089
5090 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005091 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005092
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005094 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096
5097 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005098 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 if (i < len) {
5100 if (data[i] == '\r' && i + 1 < len &&
5101 data[i+1] == '\n')
5102 i += 2;
5103 else
5104 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005105 if (keepends)
5106 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 }
Guido van Rossum86662912000-04-11 15:38:46 +00005108 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 j = i;
5110 }
5111 if (j < len) {
5112 SPLIT_APPEND(data, j, len);
5113 }
5114
5115 Py_DECREF(string);
5116 return list;
5117
5118 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005119 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 Py_DECREF(string);
5121 return NULL;
5122}
5123
Tim Petersced69f82003-09-16 20:30:58 +00005124static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125PyObject *split_char(PyUnicodeObject *self,
5126 PyObject *list,
5127 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005128 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005130 register Py_ssize_t i;
5131 register Py_ssize_t j;
5132 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 PyObject *str;
5134
5135 for (i = j = 0; i < len; ) {
5136 if (self->str[i] == ch) {
5137 if (maxcount-- <= 0)
5138 break;
5139 SPLIT_APPEND(self->str, j, i);
5140 i = j = i + 1;
5141 } else
5142 i++;
5143 }
5144 if (j <= len) {
5145 SPLIT_APPEND(self->str, j, len);
5146 }
5147 return list;
5148
5149 onError:
5150 Py_DECREF(list);
5151 return NULL;
5152}
5153
Tim Petersced69f82003-09-16 20:30:58 +00005154static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155PyObject *split_substring(PyUnicodeObject *self,
5156 PyObject *list,
5157 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005158 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005160 register Py_ssize_t i;
5161 register Py_ssize_t j;
5162 Py_ssize_t len = self->length;
5163 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 PyObject *str;
5165
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005166 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 if (Py_UNICODE_MATCH(self, i, substring)) {
5168 if (maxcount-- <= 0)
5169 break;
5170 SPLIT_APPEND(self->str, j, i);
5171 i = j = i + sublen;
5172 } else
5173 i++;
5174 }
5175 if (j <= len) {
5176 SPLIT_APPEND(self->str, j, len);
5177 }
5178 return list;
5179
5180 onError:
5181 Py_DECREF(list);
5182 return NULL;
5183}
5184
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005185static
5186PyObject *rsplit_whitespace(PyUnicodeObject *self,
5187 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005188 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005189{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005190 register Py_ssize_t i;
5191 register Py_ssize_t j;
5192 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005193 PyObject *str;
5194
5195 for (i = j = len - 1; i >= 0; ) {
5196 /* find a token */
5197 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5198 i--;
5199 j = i;
5200 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5201 i--;
5202 if (j > i) {
5203 if (maxcount-- <= 0)
5204 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005205 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005206 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5207 i--;
5208 j = i;
5209 }
5210 }
5211 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005212 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005213 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005214 if (PyList_Reverse(list) < 0)
5215 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005216 return list;
5217
5218 onError:
5219 Py_DECREF(list);
5220 return NULL;
5221}
5222
5223static
5224PyObject *rsplit_char(PyUnicodeObject *self,
5225 PyObject *list,
5226 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005227 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005228{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 register Py_ssize_t i;
5230 register Py_ssize_t j;
5231 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005232 PyObject *str;
5233
5234 for (i = j = len - 1; i >= 0; ) {
5235 if (self->str[i] == ch) {
5236 if (maxcount-- <= 0)
5237 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005238 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005239 j = i = i - 1;
5240 } else
5241 i--;
5242 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005243 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005244 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005245 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005246 if (PyList_Reverse(list) < 0)
5247 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005248 return list;
5249
5250 onError:
5251 Py_DECREF(list);
5252 return NULL;
5253}
5254
5255static
5256PyObject *rsplit_substring(PyUnicodeObject *self,
5257 PyObject *list,
5258 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005259 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005260{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005261 register Py_ssize_t i;
5262 register Py_ssize_t j;
5263 Py_ssize_t len = self->length;
5264 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005265 PyObject *str;
5266
5267 for (i = len - sublen, j = len; i >= 0; ) {
5268 if (Py_UNICODE_MATCH(self, i, substring)) {
5269 if (maxcount-- <= 0)
5270 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005271 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005272 j = i;
5273 i -= sublen;
5274 } else
5275 i--;
5276 }
5277 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005278 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005279 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005280 if (PyList_Reverse(list) < 0)
5281 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005282 return list;
5283
5284 onError:
5285 Py_DECREF(list);
5286 return NULL;
5287}
5288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289#undef SPLIT_APPEND
5290
5291static
5292PyObject *split(PyUnicodeObject *self,
5293 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005294 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295{
5296 PyObject *list;
5297
5298 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005299 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300
5301 list = PyList_New(0);
5302 if (!list)
5303 return NULL;
5304
5305 if (substring == NULL)
5306 return split_whitespace(self,list,maxcount);
5307
5308 else if (substring->length == 1)
5309 return split_char(self,list,substring->str[0],maxcount);
5310
5311 else if (substring->length == 0) {
5312 Py_DECREF(list);
5313 PyErr_SetString(PyExc_ValueError, "empty separator");
5314 return NULL;
5315 }
5316 else
5317 return split_substring(self,list,substring,maxcount);
5318}
5319
Tim Petersced69f82003-09-16 20:30:58 +00005320static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005321PyObject *rsplit(PyUnicodeObject *self,
5322 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005323 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005324{
5325 PyObject *list;
5326
5327 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005328 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005329
5330 list = PyList_New(0);
5331 if (!list)
5332 return NULL;
5333
5334 if (substring == NULL)
5335 return rsplit_whitespace(self,list,maxcount);
5336
5337 else if (substring->length == 1)
5338 return rsplit_char(self,list,substring->str[0],maxcount);
5339
5340 else if (substring->length == 0) {
5341 Py_DECREF(list);
5342 PyErr_SetString(PyExc_ValueError, "empty separator");
5343 return NULL;
5344 }
5345 else
5346 return rsplit_substring(self,list,substring,maxcount);
5347}
5348
5349static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350PyObject *replace(PyUnicodeObject *self,
5351 PyUnicodeObject *str1,
5352 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354{
5355 PyUnicodeObject *u;
5356
5357 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005358 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
Fredrik Lundh347ee272006-05-24 16:35:18 +00005360 if (str1->length == str2->length) {
5361 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005362 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005363 if (str1->length == 1) {
5364 /* replace characters */
5365 Py_UNICODE u1, u2;
5366 if (!findchar(self->str, self->length, str1->str[0]))
5367 goto nothing;
5368 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5369 if (!u)
5370 return NULL;
5371 Py_UNICODE_COPY(u->str, self->str, self->length);
5372 u1 = str1->str[0];
5373 u2 = str2->str[0];
5374 for (i = 0; i < u->length; i++)
5375 if (u->str[i] == u1) {
5376 if (--maxcount < 0)
5377 break;
5378 u->str[i] = u2;
5379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005381 i = fastsearch(
5382 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005384 if (i < 0)
5385 goto nothing;
5386 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5387 if (!u)
5388 return NULL;
5389 Py_UNICODE_COPY(u->str, self->str, self->length);
5390 while (i <= self->length - str1->length)
5391 if (Py_UNICODE_MATCH(self, i, str1)) {
5392 if (--maxcount < 0)
5393 break;
5394 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5395 i += str1->length;
5396 } else
5397 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005400
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005401 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005402 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 Py_UNICODE *p;
5404
5405 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005406 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 if (n > maxcount)
5408 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005409 if (n == 0)
5410 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005411 /* new_size = self->length + n * (str2->length - str1->length)); */
5412 delta = (str2->length - str1->length);
5413 if (delta == 0) {
5414 new_size = self->length;
5415 } else {
5416 product = n * (str2->length - str1->length);
5417 if ((product / (str2->length - str1->length)) != n) {
5418 PyErr_SetString(PyExc_OverflowError,
5419 "replace string is too long");
5420 return NULL;
5421 }
5422 new_size = self->length + product;
5423 if (new_size < 0) {
5424 PyErr_SetString(PyExc_OverflowError,
5425 "replace string is too long");
5426 return NULL;
5427 }
5428 }
5429 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005430 if (!u)
5431 return NULL;
5432 i = 0;
5433 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005434 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005435 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005436 while (n-- > 0) {
5437 /* look for next match */
5438 j = i;
5439 while (j <= e) {
5440 if (Py_UNICODE_MATCH(self, j, str1))
5441 break;
5442 j++;
5443 }
5444 if (j > i) {
5445 if (j > e)
5446 break;
5447 /* copy unchanged part [i:j] */
5448 Py_UNICODE_COPY(p, self->str+i, j-i);
5449 p += j - i;
5450 }
5451 /* copy substitution string */
5452 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005453 Py_UNICODE_COPY(p, str2->str, str2->length);
5454 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005455 }
5456 i = j + str1->length;
5457 }
5458 if (i < self->length)
5459 /* copy tail [i:] */
5460 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005461 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005462 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005463 while (n > 0) {
5464 Py_UNICODE_COPY(p, str2->str, str2->length);
5465 p += str2->length;
5466 if (--n <= 0)
5467 break;
5468 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005470 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 }
5472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005474
5475nothing:
5476 /* nothing to replace; return original string (when possible) */
5477 if (PyUnicode_CheckExact(self)) {
5478 Py_INCREF(self);
5479 return (PyObject *) self;
5480 }
5481 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482}
5483
5484/* --- Unicode Object Methods --------------------------------------------- */
5485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005486PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487"S.title() -> unicode\n\
5488\n\
5489Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005490characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
5492static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005493unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 return fixup(self, fixtitle);
5496}
5497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005498PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499"S.capitalize() -> unicode\n\
5500\n\
5501Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005502have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503
5504static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005505unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 return fixup(self, fixcapitalize);
5508}
5509
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   capitalize every word with fixcapitalize, and join the words again with
   the default separator of PyUnicode_Join() (a single blank). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slots in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item; do it here. */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return result;
}
#endif
5547
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005548/* Argument converter. Coerces to a single unicode character */
5549
5550static int
5551convert_uc(PyObject *obj, void *addr)
5552{
5553 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5554 PyObject *uniobj;
5555 Py_UNICODE *unistr;
5556
5557 uniobj = PyUnicode_FromObject(obj);
5558 if (uniobj == NULL) {
5559 PyErr_SetString(PyExc_TypeError,
5560 "The fill character cannot be converted to Unicode");
5561 return 0;
5562 }
5563 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5564 PyErr_SetString(PyExc_TypeError,
5565 "The fill character must be exactly one character long");
5566 Py_DECREF(uniobj);
5567 return 0;
5568 }
5569 unistr = PyUnicode_AS_UNICODE(uniobj);
5570 *fillcharloc = unistr[0];
5571 Py_DECREF(uniobj);
5572 return 1;
5573}
5574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005575PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005576"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005578Return S centered in a Unicode string of length width. Padding is\n\
5579done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580
5581static PyObject *
5582unicode_center(PyUnicodeObject *self, PyObject *args)
5583{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005584 Py_ssize_t marg, left;
5585 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005586 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587
Thomas Woutersde017742006-02-16 19:34:37 +00005588 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 return NULL;
5590
Tim Peters7a29bd52001-09-12 03:03:31 +00005591 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 Py_INCREF(self);
5593 return (PyObject*) self;
5594 }
5595
5596 marg = width - self->length;
5597 left = marg / 2 + (marg & width & 1);
5598
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005599 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600}
5601
Marc-André Lemburge5034372000-08-08 08:04:29 +00005602#if 0
5603
5604/* This code should go into some future Unicode collation support
5605 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005606 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005607
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005608/* speedy UTF-16 code point order comparison */
5609/* gleaned from: */
5610/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5611
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005612static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005613{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005614 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005615 0, 0, 0, 0, 0, 0, 0, 0,
5616 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005617 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005618};
5619
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620static int
5621unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5622{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005623 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005624
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 Py_UNICODE *s1 = str1->str;
5626 Py_UNICODE *s2 = str2->str;
5627
5628 len1 = str1->length;
5629 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005632 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005633
5634 c1 = *s1++;
5635 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005636
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005637 if (c1 > (1<<11) * 26)
5638 c1 += utf16Fixup[c1>>11];
5639 if (c2 > (1<<11) * 26)
5640 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005641 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005642
5643 if (c1 != c2)
5644 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005645
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005646 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 }
5648
5649 return (len1 < len2) ? -1 : (len1 != len2);
5650}
5651
Marc-André Lemburge5034372000-08-08 08:04:29 +00005652#else
5653
5654static int
5655unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5656{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005657 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005658
5659 Py_UNICODE *s1 = str1->str;
5660 Py_UNICODE *s2 = str2->str;
5661
5662 len1 = str1->length;
5663 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005664
Marc-André Lemburge5034372000-08-08 08:04:29 +00005665 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005666 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005667
Fredrik Lundh45714e92001-06-26 16:39:36 +00005668 c1 = *s1++;
5669 c2 = *s2++;
5670
5671 if (c1 != c2)
5672 return (c1 < c2) ? -1 : 1;
5673
Marc-André Lemburge5034372000-08-08 08:04:29 +00005674 len1--; len2--;
5675 }
5676
5677 return (len1 < len2) ? -1 : (len1 != len2);
5678}
5679
5680#endif
5681
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682int PyUnicode_Compare(PyObject *left,
5683 PyObject *right)
5684{
5685 PyUnicodeObject *u = NULL, *v = NULL;
5686 int result;
5687
5688 /* Coerce the two arguments */
5689 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5690 if (u == NULL)
5691 goto onError;
5692 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5693 if (v == NULL)
5694 goto onError;
5695
Thomas Wouters7e474022000-07-16 12:04:32 +00005696 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 if (v == u) {
5698 Py_DECREF(u);
5699 Py_DECREF(v);
5700 return 0;
5701 }
5702
5703 result = unicode_compare(u, v);
5704
5705 Py_DECREF(u);
5706 Py_DECREF(v);
5707 return result;
5708
5709onError:
5710 Py_XDECREF(u);
5711 Py_XDECREF(v);
5712 return -1;
5713}
5714
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005715PyObject *PyUnicode_RichCompare(PyObject *left,
5716 PyObject *right,
5717 int op)
5718{
5719 int result;
5720
5721 result = PyUnicode_Compare(left, right);
5722 if (result == -1 && PyErr_Occurred())
5723 goto onError;
5724
5725 /* Convert the return value to a Boolean */
5726 switch (op) {
5727 case Py_EQ:
5728 result = (result == 0);
5729 break;
5730 case Py_NE:
5731 result = (result != 0);
5732 break;
5733 case Py_LE:
5734 result = (result <= 0);
5735 break;
5736 case Py_GE:
5737 result = (result >= 0);
5738 break;
5739 case Py_LT:
5740 result = (result == -1);
5741 break;
5742 case Py_GT:
5743 result = (result == 1);
5744 break;
5745 }
5746 return PyBool_FromLong(result);
5747
5748 onError:
5749
5750 /* Standard case
5751
5752 Type errors mean that PyUnicode_FromObject() could not convert
5753 one of the arguments (usually the right hand side) to Unicode,
5754 ie. we can't handle the comparison request. However, it is
5755 possible that the other object knows a comparison method, which
5756 is why we return Py_NotImplemented to give the other object a
5757 chance.
5758
5759 */
5760 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5761 PyErr_Clear();
5762 Py_INCREF(Py_NotImplemented);
5763 return Py_NotImplemented;
5764 }
5765 if (op != Py_EQ && op != Py_NE)
5766 return NULL;
5767
5768 /* Equality comparison.
5769
5770 This is a special case: we silence any PyExc_UnicodeDecodeError
5771 and instead turn it into a PyErr_UnicodeWarning.
5772
5773 */
5774 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5775 return NULL;
5776 PyErr_Clear();
5777 if (PyErr_Warn(PyExc_UnicodeWarning,
5778 (op == Py_EQ) ?
5779 "Unicode equal comparison "
5780 "failed to convert both arguments to Unicode - "
5781 "interpreting them as being unequal" :
5782 "Unicode unequal comparison "
5783 "failed to convert both arguments to Unicode - "
5784 "interpreting them as being unequal"
5785 ) < 0)
5786 return NULL;
5787 result = (op == Py_NE);
5788 return PyBool_FromLong(result);
5789}
5790
Guido van Rossum403d68b2000-03-13 15:55:09 +00005791int PyUnicode_Contains(PyObject *container,
5792 PyObject *element)
5793{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005794 PyObject *str, *sub;
5795 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005796
5797 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005798 sub = PyUnicode_FromObject(element);
5799 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005800 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005801 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005802 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005803 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005804
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005805 str = PyUnicode_FromObject(container);
5806 if (!str) {
5807 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005808 return -1;
5809 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005810
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005811 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005812
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005813 Py_DECREF(str);
5814 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005815
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005816 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005817}
5818
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819/* Concat to string or Unicode object giving a new Unicode object. */
5820
5821PyObject *PyUnicode_Concat(PyObject *left,
5822 PyObject *right)
5823{
5824 PyUnicodeObject *u = NULL, *v = NULL, *w;
5825
5826 /* Coerce the two arguments */
5827 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5828 if (u == NULL)
5829 goto onError;
5830 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5831 if (v == NULL)
5832 goto onError;
5833
5834 /* Shortcuts */
5835 if (v == unicode_empty) {
5836 Py_DECREF(v);
5837 return (PyObject *)u;
5838 }
5839 if (u == unicode_empty) {
5840 Py_DECREF(u);
5841 return (PyObject *)v;
5842 }
5843
5844 /* Concat the two Unicode strings */
5845 w = _PyUnicode_New(u->length + v->length);
5846 if (w == NULL)
5847 goto onError;
5848 Py_UNICODE_COPY(w->str, u->str, u->length);
5849 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5850
5851 Py_DECREF(u);
5852 Py_DECREF(v);
5853 return (PyObject *)w;
5854
5855onError:
5856 Py_XDECREF(u);
5857 Py_XDECREF(v);
5858 return NULL;
5859}
5860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005861PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862"S.count(sub[, start[, end]]) -> int\n\
5863\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005864Return the number of non-overlapping occurrences of substring sub in\n\
5865Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005866interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867
5868static PyObject *
5869unicode_count(PyUnicodeObject *self, PyObject *args)
5870{
5871 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005872 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005873 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 PyObject *result;
5875
Guido van Rossumb8872e62000-05-09 14:14:27 +00005876 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5877 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 return NULL;
5879
5880 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005881 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 if (substring == NULL)
5883 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005884
Fredrik Lundhc8162812006-05-26 19:33:03 +00005885 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005887 result = PyInt_FromSsize_t(
5888 stringlib_count(self->str + start, end - start,
5889 substring->str, substring->length)
5890 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
5892 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005893
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 return result;
5895}
5896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005897PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005898"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005900Encodes S using the codec registered for encoding. encoding defaults\n\
5901to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005902handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5904'xmlcharrefreplace' as well as any other name registered with\n\
5905codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906
5907static PyObject *
5908unicode_encode(PyUnicodeObject *self, PyObject *args)
5909{
5910 char *encoding = NULL;
5911 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005912 PyObject *v;
5913
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5915 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005916 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005917 if (v == NULL)
5918 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005919 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5920 PyErr_Format(PyExc_TypeError,
5921 "encoder did not return a string/unicode object "
5922 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005923 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005924 Py_DECREF(v);
5925 return NULL;
5926 }
5927 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005928
5929 onError:
5930 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005931}
5932
5933PyDoc_STRVAR(decode__doc__,
5934"S.decode([encoding[,errors]]) -> string or unicode\n\
5935\n\
5936Decodes S using the codec registered for encoding. encoding defaults\n\
5937to the default encoding. errors may be given to set a different error\n\
5938handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5939a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5940as well as any other name registerd with codecs.register_error that is\n\
5941able to handle UnicodeDecodeErrors.");
5942
5943static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005944unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005945{
5946 char *encoding = NULL;
5947 char *errors = NULL;
5948 PyObject *v;
5949
5950 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5951 return NULL;
5952 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005953 if (v == NULL)
5954 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005955 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5956 PyErr_Format(PyExc_TypeError,
5957 "decoder did not return a string/unicode object "
5958 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005959 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005960 Py_DECREF(v);
5961 return NULL;
5962 }
5963 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005964
5965 onError:
5966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967}
5968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005969PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970"S.expandtabs([tabsize]) -> unicode\n\
5971\n\
5972Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005973If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974
5975static PyObject*
5976unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5977{
5978 Py_UNICODE *e;
5979 Py_UNICODE *p;
5980 Py_UNICODE *q;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005981 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 PyUnicodeObject *u;
5983 int tabsize = 8;
5984
5985 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5986 return NULL;
5987
Thomas Wouters7e474022000-07-16 12:04:32 +00005988 /* First pass: determine size of output string */
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005989 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 e = self->str + self->length;
5991 for (p = self->str; p < e; p++)
5992 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005993 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 j += tabsize - (j % tabsize);
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005995 if (old_j > j) {
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005996 PyErr_SetString(PyExc_OverflowError,
5997 "new string is too long");
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005998 return NULL;
5999 }
6000 old_j = j;
6001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 }
6003 else {
6004 j++;
6005 if (*p == '\n' || *p == '\r') {
6006 i += j;
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00006007 old_j = j = 0;
6008 if (i < 0) {
6009 PyErr_SetString(PyExc_OverflowError,
6010 "new string is too long");
6011 return NULL;
6012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 }
6014 }
6015
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006016 if ((i + j) < 0) {
6017 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6018 return NULL;
6019 }
6020
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 /* Second pass: create output string and fill it */
6022 u = _PyUnicode_New(i + j);
6023 if (!u)
6024 return NULL;
6025
6026 j = 0;
6027 q = u->str;
6028
6029 for (p = self->str; p < e; p++)
6030 if (*p == '\t') {
6031 if (tabsize > 0) {
6032 i = tabsize - (j % tabsize);
6033 j += i;
6034 while (i--)
6035 *q++ = ' ';
6036 }
6037 }
6038 else {
6039 j++;
6040 *q++ = *p;
6041 if (*p == '\n' || *p == '\r')
6042 j = 0;
6043 }
6044
6045 return (PyObject*) u;
6046}
6047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006048PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049"S.find(sub [,start [,end]]) -> int\n\
6050\n\
6051Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006052such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053arguments start and end are interpreted as in slice notation.\n\
6054\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006055Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056
6057static PyObject *
6058unicode_find(PyUnicodeObject *self, PyObject *args)
6059{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006060 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006061 Py_ssize_t start;
6062 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006063 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
Facundo Batista57d56692007-11-16 18:04:14 +00006065 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006068 result = stringlib_find_slice(
6069 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6070 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6071 start, end
6072 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
6074 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006075
6076 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077}
6078
6079static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081{
6082 if (index < 0 || index >= self->length) {
6083 PyErr_SetString(PyExc_IndexError, "string index out of range");
6084 return NULL;
6085 }
6086
6087 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6088}
6089
6090static long
6091unicode_hash(PyUnicodeObject *self)
6092{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006093 /* Since Unicode objects compare equal to their ASCII string
6094 counterparts, they should use the individual character values
6095 as basis for their hash value. This is needed to assure that
6096 strings and Unicode objects behave in the same way as
6097 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
Martin v. Löwis18e16552006-02-15 17:27:45 +00006099 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006100 register Py_UNICODE *p;
6101 register long x;
6102
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 if (self->hash != -1)
6104 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006105 len = PyUnicode_GET_SIZE(self);
6106 p = PyUnicode_AS_UNICODE(self);
6107 x = *p << 7;
6108 while (--len >= 0)
6109 x = (1000003*x) ^ *p++;
6110 x ^= PyUnicode_GET_SIZE(self);
6111 if (x == -1)
6112 x = -2;
6113 self->hash = x;
6114 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115}
6116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006117PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118"S.index(sub [,start [,end]]) -> int\n\
6119\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006120Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
6122static PyObject *
6123unicode_index(PyUnicodeObject *self, PyObject *args)
6124{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006125 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006126 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006127 Py_ssize_t start;
6128 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
Facundo Batista57d56692007-11-16 18:04:14 +00006130 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006133 result = stringlib_find_slice(
6134 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6135 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6136 start, end
6137 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
6139 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006140
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 if (result < 0) {
6142 PyErr_SetString(PyExc_ValueError, "substring not found");
6143 return NULL;
6144 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006145
Martin v. Löwis18e16552006-02-15 17:27:45 +00006146 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147}
6148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006149PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006150"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006153at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154
6155static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006156unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157{
6158 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6159 register const Py_UNICODE *e;
6160 int cased;
6161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 /* Shortcut for single character strings */
6163 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006164 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006166 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006167 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006168 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006169
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 e = p + PyUnicode_GET_SIZE(self);
6171 cased = 0;
6172 for (; p < e; p++) {
6173 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006174
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006176 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 else if (!cased && Py_UNICODE_ISLOWER(ch))
6178 cased = 1;
6179 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006180 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181}
6182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006183PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006184"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006186Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006187at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188
6189static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006190unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191{
6192 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6193 register const Py_UNICODE *e;
6194 int cased;
6195
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 /* Shortcut for single character strings */
6197 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006198 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006200 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006201 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006202 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006203
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 e = p + PyUnicode_GET_SIZE(self);
6205 cased = 0;
6206 for (; p < e; p++) {
6207 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006210 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 else if (!cased && Py_UNICODE_ISUPPER(ch))
6212 cased = 1;
6213 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006214 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215}
6216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006217PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006218"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006220Return True if S is a titlecased string and there is at least one\n\
6221character in S, i.e. upper- and titlecase characters may only\n\
6222follow uncased characters and lowercase characters only cased ones.\n\
6223Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
6225static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006226unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
6228 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6229 register const Py_UNICODE *e;
6230 int cased, previous_is_cased;
6231
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 /* Shortcut for single character strings */
6233 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006234 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6235 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006237 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006238 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006239 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006240
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 e = p + PyUnicode_GET_SIZE(self);
6242 cased = 0;
6243 previous_is_cased = 0;
6244 for (; p < e; p++) {
6245 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006246
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6248 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006249 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 previous_is_cased = 1;
6251 cased = 1;
6252 }
6253 else if (Py_UNICODE_ISLOWER(ch)) {
6254 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006255 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 previous_is_cased = 1;
6257 cased = 1;
6258 }
6259 else
6260 previous_is_cased = 0;
6261 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006262 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263}
6264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006265PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006266"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006268Return True if all characters in S are whitespace\n\
6269and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270
6271static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006272unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273{
6274 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6275 register const Py_UNICODE *e;
6276
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 /* Shortcut for single character strings */
6278 if (PyUnicode_GET_SIZE(self) == 1 &&
6279 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006280 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006282 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006283 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006284 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006285
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 e = p + PyUnicode_GET_SIZE(self);
6287 for (; p < e; p++) {
6288 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006289 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006291 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292}
6293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006294PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006295"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006296\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006297Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006298and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006299
6300static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006301unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006302{
6303 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6304 register const Py_UNICODE *e;
6305
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006306 /* Shortcut for single character strings */
6307 if (PyUnicode_GET_SIZE(self) == 1 &&
6308 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006309 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006310
6311 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006312 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006313 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006314
6315 e = p + PyUnicode_GET_SIZE(self);
6316 for (; p < e; p++) {
6317 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006318 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006319 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006320 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006321}
6322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006323PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006324"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006325\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006326Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006327and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006328
6329static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006330unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006331{
6332 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6333 register const Py_UNICODE *e;
6334
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006335 /* Shortcut for single character strings */
6336 if (PyUnicode_GET_SIZE(self) == 1 &&
6337 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006338 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006339
6340 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006341 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006342 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006343
6344 e = p + PyUnicode_GET_SIZE(self);
6345 for (; p < e; p++) {
6346 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006347 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006348 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006349 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006350}
6351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006352PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006353"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006355Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006356False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357
6358static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006359unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360{
6361 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6362 register const Py_UNICODE *e;
6363
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 /* Shortcut for single character strings */
6365 if (PyUnicode_GET_SIZE(self) == 1 &&
6366 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006367 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006369 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006370 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006371 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006372
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 e = p + PyUnicode_GET_SIZE(self);
6374 for (; p < e; p++) {
6375 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006376 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006378 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379}
6380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006381PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006382"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006384Return True if all characters in S are digits\n\
6385and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386
6387static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006388unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389{
6390 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6391 register const Py_UNICODE *e;
6392
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 /* Shortcut for single character strings */
6394 if (PyUnicode_GET_SIZE(self) == 1 &&
6395 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006396 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006398 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006399 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006400 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 e = p + PyUnicode_GET_SIZE(self);
6403 for (; p < e; p++) {
6404 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006407 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408}
6409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006410PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006411"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006413Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006414False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
6416static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006417unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418{
6419 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6420 register const Py_UNICODE *e;
6421
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 /* Shortcut for single character strings */
6423 if (PyUnicode_GET_SIZE(self) == 1 &&
6424 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006425 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006427 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006428 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006429 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006430
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 e = p + PyUnicode_GET_SIZE(self);
6432 for (; p < e; p++) {
6433 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006434 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006436 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437}
6438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006439PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440"S.join(sequence) -> unicode\n\
6441\n\
6442Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006443sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444
6445static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006446unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006448 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449}
6450
Martin v. Löwis18e16552006-02-15 17:27:45 +00006451static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452unicode_length(PyUnicodeObject *self)
6453{
6454 return self->length;
6455}
6456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006457PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006458"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459\n\
6460Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006461done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462
6463static PyObject *
6464unicode_ljust(PyUnicodeObject *self, PyObject *args)
6465{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006466 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006467 Py_UNICODE fillchar = ' ';
6468
Martin v. Löwis412fb672006-04-13 06:34:32 +00006469 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 return NULL;
6471
Tim Peters7a29bd52001-09-12 03:03:31 +00006472 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 Py_INCREF(self);
6474 return (PyObject*) self;
6475 }
6476
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006477 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478}
6479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006480PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481"S.lower() -> unicode\n\
6482\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006483Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484
6485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006486unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 return fixup(self, fixlower);
6489}
6490
/* Strip modes; each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the matching format string with its
   three leading characters ("|O:") skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
6499
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006500/* externally visible for str.strip(unicode) */
6501PyObject *
6502_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6503{
6504 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006505 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006506 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6508 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006509
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006510 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6511
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006512 i = 0;
6513 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006514 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6515 i++;
6516 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006517 }
6518
6519 j = len;
6520 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006521 do {
6522 j--;
6523 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6524 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006525 }
6526
6527 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006528 Py_INCREF(self);
6529 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006530 }
6531 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006532 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006533}
6534
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535
6536static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006537do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006539 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006540 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006541
6542 i = 0;
6543 if (striptype != RIGHTSTRIP) {
6544 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6545 i++;
6546 }
6547 }
6548
6549 j = len;
6550 if (striptype != LEFTSTRIP) {
6551 do {
6552 j--;
6553 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6554 j++;
6555 }
6556
6557 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6558 Py_INCREF(self);
6559 return (PyObject*)self;
6560 }
6561 else
6562 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563}
6564
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006565
6566static PyObject *
6567do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6568{
6569 PyObject *sep = NULL;
6570
6571 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6572 return NULL;
6573
6574 if (sep != NULL && sep != Py_None) {
6575 if (PyUnicode_Check(sep))
6576 return _PyUnicode_XStrip(self, striptype, sep);
6577 else if (PyString_Check(sep)) {
6578 PyObject *res;
6579 sep = PyUnicode_FromObject(sep);
6580 if (sep==NULL)
6581 return NULL;
6582 res = _PyUnicode_XStrip(self, striptype, sep);
6583 Py_DECREF(sep);
6584 return res;
6585 }
6586 else {
6587 PyErr_Format(PyExc_TypeError,
6588 "%s arg must be None, unicode or str",
6589 STRIPNAME(striptype));
6590 return NULL;
6591 }
6592 }
6593
6594 return do_strip(self, striptype);
6595}
6596
6597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006598PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006599"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006600\n\
6601Return a copy of the string S with leading and trailing\n\
6602whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006603If chars is given and not None, remove characters in chars instead.\n\
6604If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006605
6606static PyObject *
6607unicode_strip(PyUnicodeObject *self, PyObject *args)
6608{
6609 if (PyTuple_GET_SIZE(args) == 0)
6610 return do_strip(self, BOTHSTRIP); /* Common case */
6611 else
6612 return do_argstrip(self, BOTHSTRIP, args);
6613}
6614
6615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006616PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006617"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006618\n\
6619Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006620If chars is given and not None, remove characters in chars instead.\n\
6621If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006622
6623static PyObject *
6624unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6625{
6626 if (PyTuple_GET_SIZE(args) == 0)
6627 return do_strip(self, LEFTSTRIP); /* Common case */
6628 else
6629 return do_argstrip(self, LEFTSTRIP, args);
6630}
6631
6632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006633PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006634"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006635\n\
6636Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006637If chars is given and not None, remove characters in chars instead.\n\
6638If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006639
6640static PyObject *
6641unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6642{
6643 if (PyTuple_GET_SIZE(args) == 0)
6644 return do_strip(self, RIGHTSTRIP); /* Common case */
6645 else
6646 return do_argstrip(self, RIGHTSTRIP, args);
6647}
6648
6649
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006651unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652{
6653 PyUnicodeObject *u;
6654 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006655 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006656 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
6658 if (len < 0)
6659 len = 0;
6660
Tim Peters7a29bd52001-09-12 03:03:31 +00006661 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 /* no repeat, return original string */
6663 Py_INCREF(str);
6664 return (PyObject*) str;
6665 }
Tim Peters8f422462000-09-09 06:13:41 +00006666
6667 /* ensure # of chars needed doesn't overflow int and # of bytes
6668 * needed doesn't overflow size_t
6669 */
6670 nchars = len * str->length;
6671 if (len && nchars / len != str->length) {
6672 PyErr_SetString(PyExc_OverflowError,
6673 "repeated string is too long");
6674 return NULL;
6675 }
6676 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6677 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6678 PyErr_SetString(PyExc_OverflowError,
6679 "repeated string is too long");
6680 return NULL;
6681 }
6682 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 if (!u)
6684 return NULL;
6685
6686 p = u->str;
6687
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006688 if (str->length == 1 && len > 0) {
6689 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006690 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006691 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006692 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006693 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006694 done = str->length;
6695 }
6696 while (done < nchars) {
6697 int n = (done <= nchars-done) ? done : nchars-done;
6698 Py_UNICODE_COPY(p+done, p, n);
6699 done += n;
6700 }
6701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
6703 return (PyObject*) u;
6704}
6705
6706PyObject *PyUnicode_Replace(PyObject *obj,
6707 PyObject *subobj,
6708 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006709 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710{
6711 PyObject *self;
6712 PyObject *str1;
6713 PyObject *str2;
6714 PyObject *result;
6715
6716 self = PyUnicode_FromObject(obj);
6717 if (self == NULL)
6718 return NULL;
6719 str1 = PyUnicode_FromObject(subobj);
6720 if (str1 == NULL) {
6721 Py_DECREF(self);
6722 return NULL;
6723 }
6724 str2 = PyUnicode_FromObject(replobj);
6725 if (str2 == NULL) {
6726 Py_DECREF(self);
6727 Py_DECREF(str1);
6728 return NULL;
6729 }
Tim Petersced69f82003-09-16 20:30:58 +00006730 result = replace((PyUnicodeObject *)self,
6731 (PyUnicodeObject *)str1,
6732 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 maxcount);
6734 Py_DECREF(self);
6735 Py_DECREF(str1);
6736 Py_DECREF(str2);
6737 return result;
6738}
6739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006740PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741"S.replace (old, new[, maxsplit]) -> unicode\n\
6742\n\
6743Return a copy of S with all occurrences of substring\n\
6744old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006745given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
6747static PyObject*
6748unicode_replace(PyUnicodeObject *self, PyObject *args)
6749{
6750 PyUnicodeObject *str1;
6751 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006752 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 PyObject *result;
6754
Martin v. Löwis18e16552006-02-15 17:27:45 +00006755 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 return NULL;
6757 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6758 if (str1 == NULL)
6759 return NULL;
6760 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006761 if (str2 == NULL) {
6762 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765
6766 result = replace(self, str1, str2, maxcount);
6767
6768 Py_DECREF(str1);
6769 Py_DECREF(str2);
6770 return result;
6771}
6772
6773static
6774PyObject *unicode_repr(PyObject *unicode)
6775{
6776 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6777 PyUnicode_GET_SIZE(unicode),
6778 1);
6779}
6780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006781PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782"S.rfind(sub [,start [,end]]) -> int\n\
6783\n\
6784Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006785such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786arguments start and end are interpreted as in slice notation.\n\
6787\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006788Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789
6790static PyObject *
6791unicode_rfind(PyUnicodeObject *self, PyObject *args)
6792{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006793 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006794 Py_ssize_t start;
6795 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006796 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797
Facundo Batista57d56692007-11-16 18:04:14 +00006798 if (!_ParseTupleFinds(args, &substring, &start, &end))
6799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006801 result = stringlib_rfind_slice(
6802 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6803 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6804 start, end
6805 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
6807 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006808
6809 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810}
6811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006812PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813"S.rindex(sub [,start [,end]]) -> int\n\
6814\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006815Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816
6817static PyObject *
6818unicode_rindex(PyUnicodeObject *self, PyObject *args)
6819{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006820 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006821 Py_ssize_t start;
6822 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006823 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824
Facundo Batista57d56692007-11-16 18:04:14 +00006825 if (!_ParseTupleFinds(args, &substring, &start, &end))
6826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006828 result = stringlib_rfind_slice(
6829 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6830 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6831 start, end
6832 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833
6834 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006835
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 if (result < 0) {
6837 PyErr_SetString(PyExc_ValueError, "substring not found");
6838 return NULL;
6839 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006840 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841}
6842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006844"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845\n\
6846Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006847done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
6849static PyObject *
6850unicode_rjust(PyUnicodeObject *self, PyObject *args)
6851{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006852 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006853 Py_UNICODE fillchar = ' ';
6854
Martin v. Löwis412fb672006-04-13 06:34:32 +00006855 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 return NULL;
6857
Tim Peters7a29bd52001-09-12 03:03:31 +00006858 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 Py_INCREF(self);
6860 return (PyObject*) self;
6861 }
6862
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006863 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864}
6865
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006867unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
6869 /* standard clamping */
6870 if (start < 0)
6871 start = 0;
6872 if (end < 0)
6873 end = 0;
6874 if (end > self->length)
6875 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006876 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 /* full slice, return original string */
6878 Py_INCREF(self);
6879 return (PyObject*) self;
6880 }
6881 if (start > end)
6882 start = end;
6883 /* copy slice */
6884 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6885 end - start);
6886}
6887
6888PyObject *PyUnicode_Split(PyObject *s,
6889 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006890 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891{
6892 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006893
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 s = PyUnicode_FromObject(s);
6895 if (s == NULL)
6896 return NULL;
6897 if (sep != NULL) {
6898 sep = PyUnicode_FromObject(sep);
6899 if (sep == NULL) {
6900 Py_DECREF(s);
6901 return NULL;
6902 }
6903 }
6904
6905 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6906
6907 Py_DECREF(s);
6908 Py_XDECREF(sep);
6909 return result;
6910}
6911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006912PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913"S.split([sep [,maxsplit]]) -> list of strings\n\
6914\n\
6915Return a list of the words in S, using sep as the\n\
6916delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006917splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006918any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919
6920static PyObject*
6921unicode_split(PyUnicodeObject *self, PyObject *args)
6922{
6923 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006924 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
Martin v. Löwis18e16552006-02-15 17:27:45 +00006926 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 return NULL;
6928
6929 if (substring == Py_None)
6930 return split(self, NULL, maxcount);
6931 else if (PyUnicode_Check(substring))
6932 return split(self, (PyUnicodeObject *)substring, maxcount);
6933 else
6934 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6935}
6936
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006937PyObject *
6938PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6939{
6940 PyObject* str_obj;
6941 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006942 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006943
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006944 str_obj = PyUnicode_FromObject(str_in);
6945 if (!str_obj)
6946 return NULL;
6947 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006948 if (!sep_obj) {
6949 Py_DECREF(str_obj);
6950 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006951 }
6952
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006953 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006954 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6955 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6956 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006957
Fredrik Lundhb9479482006-05-26 17:22:38 +00006958 Py_DECREF(sep_obj);
6959 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006960
6961 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006962}
6963
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006964
6965PyObject *
6966PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6967{
6968 PyObject* str_obj;
6969 PyObject* sep_obj;
6970 PyObject* out;
6971
6972 str_obj = PyUnicode_FromObject(str_in);
6973 if (!str_obj)
6974 return NULL;
6975 sep_obj = PyUnicode_FromObject(sep_in);
6976 if (!sep_obj) {
6977 Py_DECREF(str_obj);
6978 return NULL;
6979 }
6980
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006981 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006982 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6983 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6984 );
6985
6986 Py_DECREF(sep_obj);
6987 Py_DECREF(str_obj);
6988
6989 return out;
6990}
6991
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006992PyDoc_STRVAR(partition__doc__,
6993"S.partition(sep) -> (head, sep, tail)\n\
6994\n\
6995Searches for the separator sep in S, and returns the part before it,\n\
6996the separator itself, and the part after it. If the separator is not\n\
6997found, returns S and two empty strings.");
6998
6999static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007000unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007001{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007002 return PyUnicode_Partition((PyObject *)self, separator);
7003}
7004
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007005PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007006"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007007\n\
7008Searches for the separator sep in S, starting at the end of S, and returns\n\
7009the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007010separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007011
7012static PyObject*
7013unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7014{
7015 return PyUnicode_RPartition((PyObject *)self, separator);
7016}
7017
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007018PyObject *PyUnicode_RSplit(PyObject *s,
7019 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007020 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007021{
7022 PyObject *result;
7023
7024 s = PyUnicode_FromObject(s);
7025 if (s == NULL)
7026 return NULL;
7027 if (sep != NULL) {
7028 sep = PyUnicode_FromObject(sep);
7029 if (sep == NULL) {
7030 Py_DECREF(s);
7031 return NULL;
7032 }
7033 }
7034
7035 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7036
7037 Py_DECREF(s);
7038 Py_XDECREF(sep);
7039 return result;
7040}
7041
7042PyDoc_STRVAR(rsplit__doc__,
7043"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7044\n\
7045Return a list of the words in S, using sep as the\n\
7046delimiter string, starting at the end of the string and\n\
7047working to the front. If maxsplit is given, at most maxsplit\n\
7048splits are done. If sep is not specified, any whitespace string\n\
7049is a separator.");
7050
7051static PyObject*
7052unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7053{
7054 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007055 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007056
Martin v. Löwis18e16552006-02-15 17:27:45 +00007057 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007058 return NULL;
7059
7060 if (substring == Py_None)
7061 return rsplit(self, NULL, maxcount);
7062 else if (PyUnicode_Check(substring))
7063 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7064 else
7065 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7066}
7067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007068PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007069"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070\n\
7071Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007072Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007073is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074
7075static PyObject*
7076unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7077{
Guido van Rossum86662912000-04-11 15:38:46 +00007078 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079
Guido van Rossum86662912000-04-11 15:38:46 +00007080 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 return NULL;
7082
Guido van Rossum86662912000-04-11 15:38:46 +00007083 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084}
7085
7086static
7087PyObject *unicode_str(PyUnicodeObject *self)
7088{
Fred Drakee4315f52000-05-09 19:53:39 +00007089 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090}
7091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007092PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093"S.swapcase() -> unicode\n\
7094\n\
7095Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007096and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097
7098static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007099unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 return fixup(self, fixswapcase);
7102}
7103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105"S.translate(table) -> unicode\n\
7106\n\
7107Return a copy of the string S, where all characters have been mapped\n\
7108through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007109Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7110Unmapped characters are left untouched. Characters mapped to None\n\
7111are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112
7113static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007114unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115{
Tim Petersced69f82003-09-16 20:30:58 +00007116 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007118 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 "ignore");
7120}
7121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007122PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123"S.upper() -> unicode\n\
7124\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007125Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126
7127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007128unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 return fixup(self, fixupper);
7131}
7132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007133PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134"S.zfill(width) -> unicode\n\
7135\n\
7136Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007137of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
7139static PyObject *
7140unicode_zfill(PyUnicodeObject *self, PyObject *args)
7141{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007142 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 PyUnicodeObject *u;
7144
Martin v. Löwis18e16552006-02-15 17:27:45 +00007145 Py_ssize_t width;
7146 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 return NULL;
7148
7149 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007150 if (PyUnicode_CheckExact(self)) {
7151 Py_INCREF(self);
7152 return (PyObject*) self;
7153 }
7154 else
7155 return PyUnicode_FromUnicode(
7156 PyUnicode_AS_UNICODE(self),
7157 PyUnicode_GET_SIZE(self)
7158 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 }
7160
7161 fill = width - self->length;
7162
7163 u = pad(self, fill, 0, '0');
7164
Walter Dörwald068325e2002-04-15 13:36:47 +00007165 if (u == NULL)
7166 return NULL;
7167
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 if (u->str[fill] == '+' || u->str[fill] == '-') {
7169 /* move sign to beginning of string */
7170 u->str[0] = u->str[fill];
7171 u->str[fill] = '0';
7172 }
7173
7174 return (PyObject*) u;
7175}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176
#if 0
/* Disabled debugging aid: reports unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007185PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007186"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007188Return True if S starts with the specified prefix, False otherwise.\n\
7189With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007190With optional end, stop comparing S at that position.\n\
7191prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
7193static PyObject *
7194unicode_startswith(PyUnicodeObject *self,
7195 PyObject *args)
7196{
Georg Brandl24250812006-06-09 18:45:48 +00007197 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007199 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007200 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007201 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202
Georg Brandl24250812006-06-09 18:45:48 +00007203 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007204 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007206 if (PyTuple_Check(subobj)) {
7207 Py_ssize_t i;
7208 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7209 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7210 PyTuple_GET_ITEM(subobj, i));
7211 if (substring == NULL)
7212 return NULL;
7213 result = tailmatch(self, substring, start, end, -1);
7214 Py_DECREF(substring);
7215 if (result) {
7216 Py_RETURN_TRUE;
7217 }
7218 }
7219 /* nothing matched */
7220 Py_RETURN_FALSE;
7221 }
7222 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007224 return NULL;
7225 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007227 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228}
7229
7230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007231PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007232"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007234Return True if S ends with the specified suffix, False otherwise.\n\
7235With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007236With optional end, stop comparing S at that position.\n\
7237suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238
7239static PyObject *
7240unicode_endswith(PyUnicodeObject *self,
7241 PyObject *args)
7242{
Georg Brandl24250812006-06-09 18:45:48 +00007243 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007246 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007247 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
Georg Brandl24250812006-06-09 18:45:48 +00007249 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7250 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007252 if (PyTuple_Check(subobj)) {
7253 Py_ssize_t i;
7254 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7255 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7256 PyTuple_GET_ITEM(subobj, i));
7257 if (substring == NULL)
7258 return NULL;
7259 result = tailmatch(self, substring, start, end, +1);
7260 Py_DECREF(substring);
7261 if (result) {
7262 Py_RETURN_TRUE;
7263 }
7264 }
7265 Py_RETURN_FALSE;
7266 }
7267 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007269 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270
Georg Brandl24250812006-06-09 18:45:48 +00007271 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007273 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274}
7275
7276
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007277
7278static PyObject *
7279unicode_getnewargs(PyUnicodeObject *v)
7280{
7281 return Py_BuildValue("(u#)", v->str, v->length);
7282}
7283
7284
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285static PyMethodDef unicode_methods[] = {
7286
7287 /* Order is according to common usage: often used methods should
7288 appear first, since lookup is done sequentially. */
7289
Georg Brandlecdc0a92006-03-30 12:19:07 +00007290 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007291 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7292 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007293 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007294 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7295 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7296 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7297 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7298 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7299 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7300 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007301 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007302 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7303 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7304 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007305 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007306 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007307/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7308 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7309 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7310 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007311 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007312 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007313 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007314 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007315 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7316 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7317 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7318 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7319 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7320 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7321 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7322 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7323 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7324 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7325 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7326 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7327 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7328 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007329 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007330#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007331 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332#endif
7333
7334#if 0
7335 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007336 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337#endif
7338
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007339 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 {NULL, NULL}
7341};
7342
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007343static PyObject *
7344unicode_mod(PyObject *v, PyObject *w)
7345{
7346 if (!PyUnicode_Check(v)) {
7347 Py_INCREF(Py_NotImplemented);
7348 return Py_NotImplemented;
7349 }
7350 return PyUnicode_Format(v, w);
7351}
7352
7353static PyNumberMethods unicode_as_number = {
7354 0, /*nb_add*/
7355 0, /*nb_subtract*/
7356 0, /*nb_multiply*/
7357 0, /*nb_divide*/
7358 unicode_mod, /*nb_remainder*/
7359};
7360
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007362 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007363 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007364 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7365 (ssizeargfunc) unicode_getitem, /* sq_item */
7366 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367 0, /* sq_ass_item */
7368 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007369 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370};
7371
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007372static PyObject*
7373unicode_subscript(PyUnicodeObject* self, PyObject* item)
7374{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007375 if (PyIndex_Check(item)) {
7376 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007377 if (i == -1 && PyErr_Occurred())
7378 return NULL;
7379 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007380 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007381 return unicode_getitem(self, i);
7382 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007383 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007384 Py_UNICODE* source_buf;
7385 Py_UNICODE* result_buf;
7386 PyObject* result;
7387
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007388 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007389 &start, &stop, &step, &slicelength) < 0) {
7390 return NULL;
7391 }
7392
7393 if (slicelength <= 0) {
7394 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007395 } else if (start == 0 && step == 1 && slicelength == self->length &&
7396 PyUnicode_CheckExact(self)) {
7397 Py_INCREF(self);
7398 return (PyObject *)self;
7399 } else if (step == 1) {
7400 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007401 } else {
7402 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007403 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7404 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007405
7406 if (result_buf == NULL)
7407 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007408
7409 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7410 result_buf[i] = source_buf[cur];
7411 }
Tim Petersced69f82003-09-16 20:30:58 +00007412
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007413 result = PyUnicode_FromUnicode(result_buf, slicelength);
7414 PyMem_FREE(result_buf);
7415 return result;
7416 }
7417 } else {
7418 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7419 return NULL;
7420 }
7421}
7422
7423static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007424 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007425 (binaryfunc)unicode_subscript, /* mp_subscript */
7426 (objobjargproc)0, /* mp_ass_subscript */
7427};
7428
Martin v. Löwis18e16552006-02-15 17:27:45 +00007429static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007431 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 const void **ptr)
7433{
7434 if (index != 0) {
7435 PyErr_SetString(PyExc_SystemError,
7436 "accessing non-existent unicode segment");
7437 return -1;
7438 }
7439 *ptr = (void *) self->str;
7440 return PyUnicode_GET_DATA_SIZE(self);
7441}
7442
Martin v. Löwis18e16552006-02-15 17:27:45 +00007443static Py_ssize_t
7444unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 const void **ptr)
7446{
7447 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007448 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 return -1;
7450}
7451
7452static int
7453unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007454 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455{
7456 if (lenp)
7457 *lenp = PyUnicode_GET_DATA_SIZE(self);
7458 return 1;
7459}
7460
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007461static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007463 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 const void **ptr)
7465{
7466 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007467
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 if (index != 0) {
7469 PyErr_SetString(PyExc_SystemError,
7470 "accessing non-existent unicode segment");
7471 return -1;
7472 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007473 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 if (str == NULL)
7475 return -1;
7476 *ptr = (void *) PyString_AS_STRING(str);
7477 return PyString_GET_SIZE(str);
7478}
7479
7480/* Helpers for PyUnicode_Format() */
7481
7482static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007483getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007485 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 if (argidx < arglen) {
7487 (*p_argidx)++;
7488 if (arglen < 0)
7489 return args;
7490 else
7491 return PyTuple_GetItem(args, argidx);
7492 }
7493 PyErr_SetString(PyExc_TypeError,
7494 "not enough arguments for format string");
7495 return NULL;
7496}
7497
/* Bit flags used by the formatting helpers; each occupies its own bit so
   they can be combined and tested with '&'. */
#define F_LJUST (1<<0)  /* bit 0 */
#define F_SIGN  (1<<1)  /* bit 1 */
#define F_BLANK (1<<2)  /* bit 2 */
#define F_ALT   (1<<3)  /* bit 3 */
#define F_ZERO  (1<<4)  /* bit 4 */
7503
Martin v. Löwis18e16552006-02-15 17:27:45 +00007504static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007505strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007507 register Py_ssize_t i;
7508 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 for (i = len - 1; i >= 0; i--)
7510 buffer[i] = (Py_UNICODE) charbuffer[i];
7511
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 return len;
7513}
7514
Neal Norwitzfc76d632006-01-10 06:03:13 +00007515static int
7516doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7517{
Tim Peters15231542006-02-16 01:08:01 +00007518 Py_ssize_t result;
7519
Neal Norwitzfc76d632006-01-10 06:03:13 +00007520 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007521 result = strtounicode(buffer, (char *)buffer);
7522 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007523}
7524
7525static int
7526longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7527{
Tim Peters15231542006-02-16 01:08:01 +00007528 Py_ssize_t result;
7529
Neal Norwitzfc76d632006-01-10 06:03:13 +00007530 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007531 result = strtounicode(buffer, (char *)buffer);
7532 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007533}
7534
Guido van Rossum078151d2002-08-11 04:24:12 +00007535/* XXX To save some code duplication, formatfloat/long/int could have been
7536 shared with stringobject.c, converting from 8-bit to Unicode after the
7537 formatting is done. */
7538
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539static int
7540formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007541 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 int flags,
7543 int prec,
7544 int type,
7545 PyObject *v)
7546{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007547 /* fmt = '%#.' + `prec` + `type`
7548 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 char fmt[20];
7550 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007551
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 x = PyFloat_AsDouble(v);
7553 if (x == -1.0 && PyErr_Occurred())
7554 return -1;
7555 if (prec < 0)
7556 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7558 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007559 /* Worst case length calc to ensure no buffer overrun:
7560
7561 'g' formats:
7562 fmt = %#.<prec>g
7563 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7564 for any double rep.)
7565 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7566
7567 'f' formats:
7568 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7569 len = 1 + 50 + 1 + prec = 52 + prec
7570
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007571 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007572 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007573
7574 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00007575 if (((type == 'g' || type == 'G') &&
7576 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007577 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007578 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007579 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007580 return -1;
7581 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007582 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7583 (flags&F_ALT) ? "#" : "",
7584 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007585 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586}
7587
Tim Peters38fd5b62000-09-21 05:43:11 +00007588static PyObject*
7589formatlong(PyObject *val, int flags, int prec, int type)
7590{
7591 char *buf;
7592 int i, len;
7593 PyObject *str; /* temporary string object. */
7594 PyUnicodeObject *result;
7595
7596 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7597 if (!str)
7598 return NULL;
7599 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007600 if (!result) {
7601 Py_DECREF(str);
7602 return NULL;
7603 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007604 for (i = 0; i < len; i++)
7605 result->str[i] = buf[i];
7606 result->str[len] = 0;
7607 Py_DECREF(str);
7608 return (PyObject*)result;
7609}
7610
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611static int
7612formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007613 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 int flags,
7615 int prec,
7616 int type,
7617 PyObject *v)
7618{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007619 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007620 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7621 * + 1 + 1
7622 * = 24
7623 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007624 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007625 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 long x;
7627
7628 x = PyInt_AsLong(v);
7629 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007630 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007631 if (x < 0 && type == 'u') {
7632 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007633 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007634 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7635 sign = "-";
7636 else
7637 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007639 prec = 1;
7640
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007641 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7642 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007643 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007644 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007645 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007646 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007647 return -1;
7648 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007649
7650 if ((flags & F_ALT) &&
7651 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007652 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007653 * of issues that cause pain:
7654 * - when 0 is being converted, the C standard leaves off
7655 * the '0x' or '0X', which is inconsistent with other
7656 * %#x/%#X conversions and inconsistent with Python's
7657 * hex() function
7658 * - there are platforms that violate the standard and
7659 * convert 0 with the '0x' or '0X'
7660 * (Metrowerks, Compaq Tru64)
7661 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007662 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007663 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007664 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007665 * We can achieve the desired consistency by inserting our
7666 * own '0x' or '0X' prefix, and substituting %x/%X in place
7667 * of %#x/%#X.
7668 *
7669 * Note that this is the same approach as used in
7670 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007671 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007672 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7673 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007674 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007675 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007676 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7677 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007678 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007679 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007680 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007681 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007682 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007683 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684}
7685
7686static int
7687formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007688 size_t buflen,
7689 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007691 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007692 if (PyUnicode_Check(v)) {
7693 if (PyUnicode_GET_SIZE(v) != 1)
7694 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007698 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007699 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007700 goto onError;
7701 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703
7704 else {
7705 /* Integer input truncated to a character */
7706 long x;
7707 x = PyInt_AsLong(v);
7708 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007709 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007710#ifdef Py_UNICODE_WIDE
7711 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007712 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007713 "%c arg not in range(0x110000) "
7714 "(wide Python build)");
7715 return -1;
7716 }
7717#else
7718 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007719 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007720 "%c arg not in range(0x10000) "
7721 "(narrow Python build)");
7722 return -1;
7723 }
7724#endif
7725 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 }
7727 buf[1] = '\0';
7728 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007729
7730 onError:
7731 PyErr_SetString(PyExc_TypeError,
7732 "%c requires int or char");
7733 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734}
7735
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE units) of the scratch buffer
   in which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
7745
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746PyObject *PyUnicode_Format(PyObject *format,
7747 PyObject *args)
7748{
7749 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007750 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 int args_owned = 0;
7752 PyUnicodeObject *result = NULL;
7753 PyObject *dict = NULL;
7754 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007755
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 if (format == NULL || args == NULL) {
7757 PyErr_BadInternalCall();
7758 return NULL;
7759 }
7760 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007761 if (uformat == NULL)
7762 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 fmt = PyUnicode_AS_UNICODE(uformat);
7764 fmtcnt = PyUnicode_GET_SIZE(uformat);
7765
7766 reslen = rescnt = fmtcnt + 100;
7767 result = _PyUnicode_New(reslen);
7768 if (result == NULL)
7769 goto onError;
7770 res = PyUnicode_AS_UNICODE(result);
7771
7772 if (PyTuple_Check(args)) {
7773 arglen = PyTuple_Size(args);
7774 argidx = 0;
7775 }
7776 else {
7777 arglen = -1;
7778 argidx = -2;
7779 }
Martin v. Löwis68192102007-07-21 06:55:02 +00007780 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007781 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 dict = args;
7783
7784 while (--fmtcnt >= 0) {
7785 if (*fmt != '%') {
7786 if (--rescnt < 0) {
7787 rescnt = fmtcnt + 100;
7788 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007789 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007790 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7792 --rescnt;
7793 }
7794 *res++ = *fmt++;
7795 }
7796 else {
7797 /* Got a format specifier */
7798 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007799 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 Py_UNICODE c = '\0';
7802 Py_UNICODE fill;
7803 PyObject *v = NULL;
7804 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007805 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007808 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809
7810 fmt++;
7811 if (*fmt == '(') {
7812 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007813 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 PyObject *key;
7815 int pcount = 1;
7816
7817 if (dict == NULL) {
7818 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007819 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820 goto onError;
7821 }
7822 ++fmt;
7823 --fmtcnt;
7824 keystart = fmt;
7825 /* Skip over balanced parentheses */
7826 while (pcount > 0 && --fmtcnt >= 0) {
7827 if (*fmt == ')')
7828 --pcount;
7829 else if (*fmt == '(')
7830 ++pcount;
7831 fmt++;
7832 }
7833 keylen = fmt - keystart - 1;
7834 if (fmtcnt < 0 || pcount > 0) {
7835 PyErr_SetString(PyExc_ValueError,
7836 "incomplete format key");
7837 goto onError;
7838 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007839#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007840 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 then looked up since Python uses strings to hold
7842 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007843 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 key = PyUnicode_EncodeUTF8(keystart,
7845 keylen,
7846 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007847#else
7848 key = PyUnicode_FromUnicode(keystart, keylen);
7849#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 if (key == NULL)
7851 goto onError;
7852 if (args_owned) {
7853 Py_DECREF(args);
7854 args_owned = 0;
7855 }
7856 args = PyObject_GetItem(dict, key);
7857 Py_DECREF(key);
7858 if (args == NULL) {
7859 goto onError;
7860 }
7861 args_owned = 1;
7862 arglen = -1;
7863 argidx = -2;
7864 }
7865 while (--fmtcnt >= 0) {
7866 switch (c = *fmt++) {
7867 case '-': flags |= F_LJUST; continue;
7868 case '+': flags |= F_SIGN; continue;
7869 case ' ': flags |= F_BLANK; continue;
7870 case '#': flags |= F_ALT; continue;
7871 case '0': flags |= F_ZERO; continue;
7872 }
7873 break;
7874 }
7875 if (c == '*') {
7876 v = getnextarg(args, arglen, &argidx);
7877 if (v == NULL)
7878 goto onError;
7879 if (!PyInt_Check(v)) {
7880 PyErr_SetString(PyExc_TypeError,
7881 "* wants int");
7882 goto onError;
7883 }
7884 width = PyInt_AsLong(v);
7885 if (width < 0) {
7886 flags |= F_LJUST;
7887 width = -width;
7888 }
7889 if (--fmtcnt >= 0)
7890 c = *fmt++;
7891 }
7892 else if (c >= '0' && c <= '9') {
7893 width = c - '0';
7894 while (--fmtcnt >= 0) {
7895 c = *fmt++;
7896 if (c < '0' || c > '9')
7897 break;
7898 if ((width*10) / 10 != width) {
7899 PyErr_SetString(PyExc_ValueError,
7900 "width too big");
7901 goto onError;
7902 }
7903 width = width*10 + (c - '0');
7904 }
7905 }
7906 if (c == '.') {
7907 prec = 0;
7908 if (--fmtcnt >= 0)
7909 c = *fmt++;
7910 if (c == '*') {
7911 v = getnextarg(args, arglen, &argidx);
7912 if (v == NULL)
7913 goto onError;
7914 if (!PyInt_Check(v)) {
7915 PyErr_SetString(PyExc_TypeError,
7916 "* wants int");
7917 goto onError;
7918 }
7919 prec = PyInt_AsLong(v);
7920 if (prec < 0)
7921 prec = 0;
7922 if (--fmtcnt >= 0)
7923 c = *fmt++;
7924 }
7925 else if (c >= '0' && c <= '9') {
7926 prec = c - '0';
7927 while (--fmtcnt >= 0) {
7928 c = Py_CHARMASK(*fmt++);
7929 if (c < '0' || c > '9')
7930 break;
7931 if ((prec*10) / 10 != prec) {
7932 PyErr_SetString(PyExc_ValueError,
7933 "prec too big");
7934 goto onError;
7935 }
7936 prec = prec*10 + (c - '0');
7937 }
7938 }
7939 } /* prec */
7940 if (fmtcnt >= 0) {
7941 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 if (--fmtcnt >= 0)
7943 c = *fmt++;
7944 }
7945 }
7946 if (fmtcnt < 0) {
7947 PyErr_SetString(PyExc_ValueError,
7948 "incomplete format");
7949 goto onError;
7950 }
7951 if (c != '%') {
7952 v = getnextarg(args, arglen, &argidx);
7953 if (v == NULL)
7954 goto onError;
7955 }
7956 sign = 0;
7957 fill = ' ';
7958 switch (c) {
7959
7960 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007961 pbuf = formatbuf;
7962 /* presume that buffer length is at least 1 */
7963 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 len = 1;
7965 break;
7966
7967 case 's':
7968 case 'r':
7969 if (PyUnicode_Check(v) && c == 's') {
7970 temp = v;
7971 Py_INCREF(temp);
7972 }
7973 else {
7974 PyObject *unicode;
7975 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007976 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 else
7978 temp = PyObject_Repr(v);
7979 if (temp == NULL)
7980 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007981 if (PyUnicode_Check(temp))
7982 /* nothing to do */;
7983 else if (PyString_Check(temp)) {
7984 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007985 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007987 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007989 Py_DECREF(temp);
7990 temp = unicode;
7991 if (temp == NULL)
7992 goto onError;
7993 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007994 else {
7995 Py_DECREF(temp);
7996 PyErr_SetString(PyExc_TypeError,
7997 "%s argument has non-string str()");
7998 goto onError;
7999 }
8000 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008001 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 len = PyUnicode_GET_SIZE(temp);
8003 if (prec >= 0 && len > prec)
8004 len = prec;
8005 break;
8006
8007 case 'i':
8008 case 'd':
8009 case 'u':
8010 case 'o':
8011 case 'x':
8012 case 'X':
8013 if (c == 'i')
8014 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008015 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008016 temp = formatlong(v, flags, prec, c);
8017 if (!temp)
8018 goto onError;
8019 pbuf = PyUnicode_AS_UNICODE(temp);
8020 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008021 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008023 else {
8024 pbuf = formatbuf;
8025 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8026 flags, prec, c, v);
8027 if (len < 0)
8028 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008029 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008030 }
8031 if (flags & F_ZERO)
8032 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 break;
8034
8035 case 'e':
8036 case 'E':
8037 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008038 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 case 'g':
8040 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008041 if (c == 'F')
8042 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008043 pbuf = formatbuf;
8044 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8045 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 if (len < 0)
8047 goto onError;
8048 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008049 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 fill = '0';
8051 break;
8052
8053 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008054 pbuf = formatbuf;
8055 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 if (len < 0)
8057 goto onError;
8058 break;
8059
8060 default:
8061 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008062 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008063 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008064 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008065 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008066 (Py_ssize_t)(fmt - 1 -
8067 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 goto onError;
8069 }
8070 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008071 if (*pbuf == '-' || *pbuf == '+') {
8072 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 len--;
8074 }
8075 else if (flags & F_SIGN)
8076 sign = '+';
8077 else if (flags & F_BLANK)
8078 sign = ' ';
8079 else
8080 sign = 0;
8081 }
8082 if (width < len)
8083 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008084 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 reslen -= rescnt;
8086 rescnt = width + fmtcnt + 100;
8087 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008088 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008089 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008090 PyErr_NoMemory();
8091 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008092 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008093 if (_PyUnicode_Resize(&result, reslen) < 0) {
8094 Py_XDECREF(temp);
8095 goto onError;
8096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 res = PyUnicode_AS_UNICODE(result)
8098 + reslen - rescnt;
8099 }
8100 if (sign) {
8101 if (fill != ' ')
8102 *res++ = sign;
8103 rescnt--;
8104 if (width > len)
8105 width--;
8106 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008107 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8108 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008109 assert(pbuf[1] == c);
8110 if (fill != ' ') {
8111 *res++ = *pbuf++;
8112 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008113 }
Tim Petersfff53252001-04-12 18:38:48 +00008114 rescnt -= 2;
8115 width -= 2;
8116 if (width < 0)
8117 width = 0;
8118 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008119 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 if (width > len && !(flags & F_LJUST)) {
8121 do {
8122 --rescnt;
8123 *res++ = fill;
8124 } while (--width > len);
8125 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008126 if (fill == ' ') {
8127 if (sign)
8128 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008129 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008130 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008131 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008132 *res++ = *pbuf++;
8133 *res++ = *pbuf++;
8134 }
8135 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008136 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 res += len;
8138 rescnt -= len;
8139 while (--width >= len) {
8140 --rescnt;
8141 *res++ = ' ';
8142 }
8143 if (dict && (argidx < arglen) && c != '%') {
8144 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008145 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008146 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 goto onError;
8148 }
8149 Py_XDECREF(temp);
8150 } /* '%' */
8151 } /* until end */
8152 if (argidx < arglen && !dict) {
8153 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008154 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 goto onError;
8156 }
8157
Thomas Woutersa96affe2006-03-12 00:29:36 +00008158 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 if (args_owned) {
8161 Py_DECREF(args);
8162 }
8163 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164 return (PyObject *)result;
8165
8166 onError:
8167 Py_XDECREF(result);
8168 Py_DECREF(uformat);
8169 if (args_owned) {
8170 Py_DECREF(args);
8171 }
8172 return NULL;
8173}
8174
/* Buffer-interface slot table for unicode objects; installed as the
   tp_as_buffer member of PyUnicode_Type.  The casts indicate the role
   of each entry: read buffer, write buffer, segment count and
   character buffer, in that order. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8181
Jeremy Hylton938ace62002-07-17 16:30:39 +00008182static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008183unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8184
Tim Peters6d6c1a32001-08-02 04:15:00 +00008185static PyObject *
8186unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8187{
8188 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008189 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008190 char *encoding = NULL;
8191 char *errors = NULL;
8192
Guido van Rossume023fe02001-08-30 03:12:59 +00008193 if (type != &PyUnicode_Type)
8194 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008195 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8196 kwlist, &x, &encoding, &errors))
8197 return NULL;
8198 if (x == NULL)
8199 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008200 if (encoding == NULL && errors == NULL)
8201 return PyObject_Unicode(x);
8202 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008203 return PyUnicode_FromEncodedObject(x, encoding, errors);
8204}
8205
Guido van Rossume023fe02001-08-30 03:12:59 +00008206static PyObject *
8207unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8208{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008209 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008210 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008211
8212 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8213 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8214 if (tmp == NULL)
8215 return NULL;
8216 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008217 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008218 if (pnew == NULL) {
8219 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008220 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008221 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008222 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8223 if (pnew->str == NULL) {
8224 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008225 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008226 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008227 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008228 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008229 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8230 pnew->length = n;
8231 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008232 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008233 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008234}
8235
/* Docstring of the unicode type; referenced as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008242
/* Type object for unicode.  The initializer is positional, so every
   entry must stay in slot order; unused slots are filled with 0.  The
   base type is PyBaseString_Type and instances are created through
   unicode_new. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8286
8287/* Initialize the Unicode implementation */
8288
Thomas Wouters78890102000-07-22 19:25:51 +00008289void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008291 int i;
8292
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008293 /* XXX - move this array to unicodectype.c ? */
8294 Py_UNICODE linebreak[] = {
8295 0x000A, /* LINE FEED */
8296 0x000D, /* CARRIAGE RETURN */
8297 0x001C, /* FILE SEPARATOR */
8298 0x001D, /* GROUP SEPARATOR */
8299 0x001E, /* RECORD SEPARATOR */
8300 0x0085, /* NEXT LINE */
8301 0x2028, /* LINE SEPARATOR */
8302 0x2029, /* PARAGRAPH SEPARATOR */
8303 };
8304
Fred Drakee4315f52000-05-09 19:53:39 +00008305 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008306 unicode_freelist = NULL;
8307 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008309 if (!unicode_empty)
8310 return;
8311
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008312 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008313 for (i = 0; i < 256; i++)
8314 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008315 if (PyType_Ready(&PyUnicode_Type) < 0)
8316 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008317
8318 /* initialize the linebreak bloom filter */
8319 bloom_linebreak = make_bloom_mask(
8320 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8321 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008322
8323 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324}
8325
8326/* Finalize the Unicode implementation */
8327
8328void
Thomas Wouters78890102000-07-22 19:25:51 +00008329_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008331 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008332 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008334 Py_XDECREF(unicode_empty);
8335 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008336
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008337 for (i = 0; i < 256; i++) {
8338 if (unicode_latin1[i]) {
8339 Py_DECREF(unicode_latin1[i]);
8340 unicode_latin1[i] = NULL;
8341 }
8342 }
8343
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008344 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 PyUnicodeObject *v = u;
8346 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008347 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008348 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008349 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008350 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008352 unicode_freelist = NULL;
8353 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008355
Anthony Baxterac6bd462006-04-13 02:06:09 +00008356#ifdef __cplusplus
8357}
8358#endif
8359
8360
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008361/*
8362Local variables:
8363c-basic-offset: 4
8364indent-tabs-mode: nil
8365End:
8366*/