blob: c749ac51a7e5052acd19ddbe8a3b6b59f5124480 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the least
   significant 5 bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type that holds a bloom mask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if bit (ch & 0x1F) is set in mask.

   The shifted constant is 1UL rather than 1: shifting a signed int 1 into
   bit 31 is undefined behaviour, and where it "works" the negative result
   sign-extends when widened to unsigned long, producing false positives
   against the upper mask bits.  The mask argument is parenthesized so that
   an expression such as (a | b) can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter with bloom_linebreak, followed by the exact
   Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Non-zero if chr is a member of set (setlen characters): the bloom mask
   is consulted first so that most non-members skip the linear search.
   The whole expansion is parenthesized so that the macro can be negated
   or combined with || at the call site without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
Neal Norwitzb3635f92008-03-18 04:17:36 +0000203 unicode->str = PyObject_REALLOC(unicode->str,
204 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000206 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 PyErr_NoMemory();
208 return -1;
209 }
210 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000211 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000215 if (unicode->defenc) {
216 Py_DECREF(unicode->defenc);
217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 }
219 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000220
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 return 0;
222}
223
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
231
232static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000233PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234{
235 register PyUnicodeObject *unicode;
236
Andrew Dalkee0df7622006-05-27 11:04:36 +0000237 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 if (length == 0 && unicode_empty != NULL) {
239 Py_INCREF(unicode_empty);
240 return unicode_empty;
241 }
242
243 /* Unicode freelist & memory allocation */
244 if (unicode_freelist) {
245 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000246 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000249 /* Keep-Alive optimization: we only upsize the buffer,
250 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000251 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000252 unicode_resize(unicode, length) < 0) {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000253 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 }
256 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000257 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000258 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
259 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000260 }
261 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 }
263 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000264 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000265 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode == NULL)
267 return NULL;
Neal Norwitzb3635f92008-03-18 04:17:36 +0000268 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
269 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 }
271
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000272 if (!unicode->str) {
273 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000274 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000275 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000276 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000277 * the caller fails before initializing str -- unicode_resize()
278 * reads str[0], and the Keep-Alive optimization can keep memory
279 * allocated for str alive across a call to unicode_dealloc(unicode).
280 * We don't want unicode_resize to read uninitialized memory in
281 * that case.
282 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000283 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000285 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289
290 onError:
291 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000292 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294}
295
296static
Guido van Rossum9475a232001-10-05 20:51:39 +0000297void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000299 if (PyUnicode_CheckExact(unicode) &&
300 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000301 /* Keep-Alive optimization */
302 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000303 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 unicode->str = NULL;
305 unicode->length = 0;
306 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000307 if (unicode->defenc) {
308 Py_DECREF(unicode->defenc);
309 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000310 }
311 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 *(PyUnicodeObject **)unicode = unicode_freelist;
313 unicode_freelist = unicode;
314 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 }
316 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000317 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000318 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000319 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 }
321}
322
Martin v. Löwis18e16552006-02-15 17:27:45 +0000323int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000324{
325 register PyUnicodeObject *v;
326
327 /* Argument checks */
328 if (unicode == NULL) {
329 PyErr_BadInternalCall();
330 return -1;
331 }
332 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000333 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 PyErr_BadInternalCall();
335 return -1;
336 }
337
338 /* Resizing unicode_empty and single character objects is not
339 possible since these are being shared. We simply return a fresh
340 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000341 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 (v == unicode_empty || v->length == 1)) {
343 PyUnicodeObject *w = _PyUnicode_New(length);
344 if (w == NULL)
345 return -1;
346 Py_UNICODE_COPY(w->str, v->str,
347 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000348 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349 *unicode = (PyObject *)w;
350 return 0;
351 }
352
353 /* Note that we don't have to modify *unicode for unshared Unicode
354 objects, since we can modify them in-place. */
355 return unicode_resize(v, length);
356}
357
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that casts the address of
   the object variable to PyObject **.  Both arguments are parenthesized so
   that arbitrary expressions can be passed. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
361
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000363 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
365 PyUnicodeObject *unicode;
366
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 /* If the Unicode data is known at construction time, we can apply
368 some optimizations which share commonly used objects. */
369 if (u != NULL) {
370
371 /* Optimization for empty strings */
372 if (size == 0 && unicode_empty != NULL) {
373 Py_INCREF(unicode_empty);
374 return (PyObject *)unicode_empty;
375 }
376
377 /* Single character Unicode objects in the Latin-1 range are
378 shared when using this constructor */
379 if (size == 1 && *u < 256) {
380 unicode = unicode_latin1[*u];
381 if (!unicode) {
382 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000383 if (!unicode)
384 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000385 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000386 unicode_latin1[*u] = unicode;
387 }
388 Py_INCREF(unicode);
389 return (PyObject *)unicode;
390 }
391 }
Tim Petersced69f82003-09-16 20:30:58 +0000392
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 unicode = _PyUnicode_New(size);
394 if (!unicode)
395 return NULL;
396
397 /* Copy the Unicode data into the new object */
398 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000399 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400
401 return (PyObject *)unicode;
402}
403
404#ifdef HAVE_WCHAR_H
405
406PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000407 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408{
409 PyUnicodeObject *unicode;
410
411 if (w == NULL) {
412 PyErr_BadInternalCall();
413 return NULL;
414 }
415
416 unicode = _PyUnicode_New(size);
417 if (!unicode)
418 return NULL;
419
420 /* Copy the wchar_t data into the new object */
421#ifdef HAVE_USABLE_WCHAR_T
422 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000423#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000424 {
425 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000426 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000428 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429 *u++ = *w++;
430 }
431#endif
432
433 return (PyObject *)unicode;
434}
435
Martin v. Löwis18e16552006-02-15 17:27:45 +0000436Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
437 wchar_t *w,
438 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000439{
440 if (unicode == NULL) {
441 PyErr_BadInternalCall();
442 return -1;
443 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000444
445 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000447 size = PyUnicode_GET_SIZE(unicode) + 1;
448
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449#ifdef HAVE_USABLE_WCHAR_T
450 memcpy(w, unicode->str, size * sizeof(wchar_t));
451#else
452 {
453 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000454 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000456 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457 *w++ = *u++;
458 }
459#endif
460
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000461 if (size > PyUnicode_GET_SIZE(unicode))
462 return PyUnicode_GET_SIZE(unicode);
463 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 return size;
465}
466
467#endif
468
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000469PyObject *PyUnicode_FromOrdinal(int ordinal)
470{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000471 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000472
473#ifdef Py_UNICODE_WIDE
474 if (ordinal < 0 || ordinal > 0x10ffff) {
475 PyErr_SetString(PyExc_ValueError,
476 "unichr() arg not in range(0x110000) "
477 "(wide Python build)");
478 return NULL;
479 }
480#else
481 if (ordinal < 0 || ordinal > 0xffff) {
482 PyErr_SetString(PyExc_ValueError,
483 "unichr() arg not in range(0x10000) "
484 "(narrow Python build)");
485 return NULL;
486 }
487#endif
488
Hye-Shik Chang40574832004-04-06 07:24:51 +0000489 s[0] = (Py_UNICODE)ordinal;
490 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000491}
492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493PyObject *PyUnicode_FromObject(register PyObject *obj)
494{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000495 /* XXX Perhaps we should make this API an alias of
496 PyObject_Unicode() instead ?! */
497 if (PyUnicode_CheckExact(obj)) {
498 Py_INCREF(obj);
499 return obj;
500 }
501 if (PyUnicode_Check(obj)) {
502 /* For a Unicode subtype that's not a Unicode object,
503 return a true Unicode object with the same data. */
504 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
505 PyUnicode_GET_SIZE(obj));
506 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000507 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
508}
509
510PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
511 const char *encoding,
512 const char *errors)
513{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000514 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000515 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000517
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518 if (obj == NULL) {
519 PyErr_BadInternalCall();
520 return NULL;
521 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000522
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000523#if 0
524 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000525 that no encodings is given and then redirect to
526 PyObject_Unicode() which then applies the additional logic for
527 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000528
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000529 NOTE: This API should really only be used for object which
530 represent *encoded* Unicode !
531
532 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000533 if (PyUnicode_Check(obj)) {
534 if (encoding) {
535 PyErr_SetString(PyExc_TypeError,
536 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000538 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000539 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000540 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000541#else
542 if (PyUnicode_Check(obj)) {
543 PyErr_SetString(PyExc_TypeError,
544 "decoding Unicode is not supported");
545 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000546 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000547#endif
548
549 /* Coerce object */
550 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000551 s = PyString_AS_STRING(obj);
552 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000553 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000554 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
555 /* Overwrite the error message with something more useful in
556 case of a TypeError. */
557 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000559 "coercing to Unicode: need string or buffer, "
560 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000561 obj->ob_type->tp_name);
562 goto onError;
563 }
Tim Petersced69f82003-09-16 20:30:58 +0000564
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000565 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 if (len == 0) {
567 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000568 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000569 }
Tim Petersced69f82003-09-16 20:30:58 +0000570 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000571 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000572
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000573 return v;
574
575 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577}
578
579PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000580 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 const char *encoding,
582 const char *errors)
583{
584 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000585
586 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000587 encoding = PyUnicode_GetDefaultEncoding();
588
589 /* Shortcuts for common default encodings */
590 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000592 else if (strcmp(encoding, "latin-1") == 0)
593 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000594#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
595 else if (strcmp(encoding, "mbcs") == 0)
596 return PyUnicode_DecodeMBCS(s, size, errors);
597#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000598 else if (strcmp(encoding, "ascii") == 0)
599 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600
601 /* Decode via the codec registry */
602 buffer = PyBuffer_FromMemory((void *)s, size);
603 if (buffer == NULL)
604 goto onError;
605 unicode = PyCodec_Decode(buffer, encoding, errors);
606 if (unicode == NULL)
607 goto onError;
608 if (!PyUnicode_Check(unicode)) {
609 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000610 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000611 unicode->ob_type->tp_name);
612 Py_DECREF(unicode);
613 goto onError;
614 }
615 Py_DECREF(buffer);
616 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000617
Guido van Rossumd57fd912000-03-10 22:53:23 +0000618 onError:
619 Py_XDECREF(buffer);
620 return NULL;
621}
622
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000623PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
624 const char *encoding,
625 const char *errors)
626{
627 PyObject *v;
628
629 if (!PyUnicode_Check(unicode)) {
630 PyErr_BadArgument();
631 goto onError;
632 }
633
634 if (encoding == NULL)
635 encoding = PyUnicode_GetDefaultEncoding();
636
637 /* Decode via the codec registry */
638 v = PyCodec_Decode(unicode, encoding, errors);
639 if (v == NULL)
640 goto onError;
641 return v;
642
643 onError:
644 return NULL;
645}
646
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000648 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649 const char *encoding,
650 const char *errors)
651{
652 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000653
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 unicode = PyUnicode_FromUnicode(s, size);
655 if (unicode == NULL)
656 return NULL;
657 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
658 Py_DECREF(unicode);
659 return v;
660}
661
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000662PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
663 const char *encoding,
664 const char *errors)
665{
666 PyObject *v;
667
668 if (!PyUnicode_Check(unicode)) {
669 PyErr_BadArgument();
670 goto onError;
671 }
672
673 if (encoding == NULL)
674 encoding = PyUnicode_GetDefaultEncoding();
675
676 /* Encode via the codec registry */
677 v = PyCodec_Encode(unicode, encoding, errors);
678 if (v == NULL)
679 goto onError;
680 return v;
681
682 onError:
683 return NULL;
684}
685
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
687 const char *encoding,
688 const char *errors)
689{
690 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000691
Guido van Rossumd57fd912000-03-10 22:53:23 +0000692 if (!PyUnicode_Check(unicode)) {
693 PyErr_BadArgument();
694 goto onError;
695 }
Fred Drakee4315f52000-05-09 19:53:39 +0000696
Tim Petersced69f82003-09-16 20:30:58 +0000697 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000698 encoding = PyUnicode_GetDefaultEncoding();
699
700 /* Shortcuts for common default encodings */
701 if (errors == NULL) {
702 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000703 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000704 else if (strcmp(encoding, "latin-1") == 0)
705 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000706#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
707 else if (strcmp(encoding, "mbcs") == 0)
708 return PyUnicode_AsMBCSString(unicode);
709#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000710 else if (strcmp(encoding, "ascii") == 0)
711 return PyUnicode_AsASCIIString(unicode);
712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000713
714 /* Encode via the codec registry */
715 v = PyCodec_Encode(unicode, encoding, errors);
716 if (v == NULL)
717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 if (!PyString_Check(v)) {
719 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000720 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000721 v->ob_type->tp_name);
722 Py_DECREF(v);
723 goto onError;
724 }
725 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000726
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 onError:
728 return NULL;
729}
730
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000731PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
732 const char *errors)
733{
734 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
735
736 if (v)
737 return v;
738 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
739 if (v && errors == NULL)
740 ((PyUnicodeObject *)unicode)->defenc = v;
741 return v;
742}
743
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
745{
746 if (!PyUnicode_Check(unicode)) {
747 PyErr_BadArgument();
748 goto onError;
749 }
750 return PyUnicode_AS_UNICODE(unicode);
751
752 onError:
753 return NULL;
754}
755
Martin v. Löwis18e16552006-02-15 17:27:45 +0000756Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757{
758 if (!PyUnicode_Check(unicode)) {
759 PyErr_BadArgument();
760 goto onError;
761 }
762 return PyUnicode_GET_SIZE(unicode);
763
764 onError:
765 return -1;
766}
767
Thomas Wouters78890102000-07-22 19:25:51 +0000768const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000769{
770 return unicode_default_encoding;
771}
772
773int PyUnicode_SetDefaultEncoding(const char *encoding)
774{
775 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000776
Fred Drakee4315f52000-05-09 19:53:39 +0000777 /* Make sure the encoding is valid. As side effect, this also
778 loads the encoding into the codec registry cache. */
779 v = _PyCodec_Lookup(encoding);
780 if (v == NULL)
781 goto onError;
782 Py_DECREF(v);
783 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000784 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000785 sizeof(unicode_default_encoding));
786 return 0;
787
788 onError:
789 return -1;
790}
791
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000792/* error handling callback helper:
793 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000794 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795 and adjust various state variables.
796 return 0 on success, -1 on error
797*/
798
799static
800int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
801 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000802 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
803 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000804{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000806
807 PyObject *restuple = NULL;
808 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
810 Py_ssize_t requiredsize;
811 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000812 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000813 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000814 int res = -1;
815
816 if (*errorHandler == NULL) {
817 *errorHandler = PyCodec_LookupError(errors);
818 if (*errorHandler == NULL)
819 goto onError;
820 }
821
822 if (*exceptionObject == NULL) {
823 *exceptionObject = PyUnicodeDecodeError_Create(
824 encoding, input, insize, *startinpos, *endinpos, reason);
825 if (*exceptionObject == NULL)
826 goto onError;
827 }
828 else {
829 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
830 goto onError;
831 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
832 goto onError;
833 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
834 goto onError;
835 }
836
837 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
838 if (restuple == NULL)
839 goto onError;
840 if (!PyTuple_Check(restuple)) {
841 PyErr_Format(PyExc_TypeError, &argparse[4]);
842 goto onError;
843 }
844 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
845 goto onError;
846 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000847 newpos = insize+newpos;
848 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000849 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000850 goto onError;
851 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852
853 /* need more space? (at least enough for what we
854 have+the replacement+the rest of the string (starting
855 at the new input position), so we won't have to check space
856 when there are no errors in the rest of the string) */
857 repptr = PyUnicode_AS_UNICODE(repunicode);
858 repsize = PyUnicode_GET_SIZE(repunicode);
859 requiredsize = *outpos + repsize + insize-newpos;
860 if (requiredsize > outsize) {
861 if (requiredsize<2*outsize)
862 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000863 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000864 goto onError;
865 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
866 }
867 *endinpos = newpos;
868 *inptr = input + newpos;
869 Py_UNICODE_COPY(*outptr, repptr, repsize);
870 *outptr += repsize;
871 *outpos += repsize;
872 /* we made it! */
873 res = 0;
874
875 onError:
876 Py_XDECREF(restuple);
877 return res;
878}
879
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000880/* --- UTF-7 Codec -------------------------------------------------------- */
881
882/* see RFC2152 for details */
883
/* Classification of the 128 ASCII code points for the UTF-7 codec: tells
   whether a character is special, i.e. cannot be directly encoded.
     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)
   The table is indexed by character code; every row holds 16 entries. */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */  2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
902
/* Helper macros for the UTF-7 codec.

   Note: The comparison (c) <= 0 in SPECIAL is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c has to be base64-encoded: anything outside 1..127, entries
   marked 1 in utf7_special, plus the whitespace class (2) when encodeWS
   is set and the Set O class (3) when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  The range test comes first
   so that isalnum() only ever sees ASCII values: passing it anything
   outside the unsigned char range is undefined, and for 0x80..0xff its
   answer depends on the current locale. */
#define B64CHAR(c) \
    ((unsigned long)(c) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* Value (0..63) of the base64 digit c; c must satisfy B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through out for as long as at least six of the
   `bits` pending bits in ch remain; out is advanced and bits is reduced
   accordingly.  ch itself is not modified. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Extract 16-bit units from the `bits` pending bits in ch and store them
   through out.  Relies on the local variable errmsg and the label
   utf7Error of the function it is used in. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000948 const char *errors)
949{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000950 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000951 Py_ssize_t startinpos;
952 Py_ssize_t endinpos;
953 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 const char *e;
955 PyUnicodeObject *unicode;
956 Py_UNICODE *p;
957 const char *errmsg = "";
958 int inShift = 0;
959 unsigned int bitsleft = 0;
960 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000961 int surrogate = 0;
962 PyObject *errorHandler = NULL;
963 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000964
965 unicode = _PyUnicode_New(size);
966 if (!unicode)
967 return NULL;
968 if (size == 0)
969 return (PyObject *)unicode;
970
971 p = unicode->str;
972 e = s + size;
973
974 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000975 Py_UNICODE ch;
976 restart:
977 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000978
979 if (inShift) {
980 if ((ch == '-') || !B64CHAR(ch)) {
981 inShift = 0;
982 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000983
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000984 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
985 if (bitsleft >= 6) {
986 /* The shift sequence has a partial character in it. If
987 bitsleft < 6 then we could just classify it as padding
988 but that is not the case here */
989
990 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000991 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000992 }
993 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000994 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000995 here so indicate the potential of a misencoded character. */
996
997 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
998 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
999 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001000 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 }
1002
1003 if (ch == '-') {
1004 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001005 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 inShift = 1;
1007 }
1008 } else if (SPECIAL(ch,0,0)) {
1009 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001010 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 } else {
1012 *p++ = ch;
1013 }
1014 } else {
1015 charsleft = (charsleft << 6) | UB64(ch);
1016 bitsleft += 6;
1017 s++;
1018 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1019 }
1020 }
1021 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001022 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001023 s++;
1024 if (s < e && *s == '-') {
1025 s++;
1026 *p++ = '+';
1027 } else
1028 {
1029 inShift = 1;
1030 bitsleft = 0;
1031 }
1032 }
1033 else if (SPECIAL(ch,0,0)) {
1034 errmsg = "unexpected special character";
1035 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001036 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001037 }
1038 else {
1039 *p++ = ch;
1040 s++;
1041 }
1042 continue;
1043 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001044 outpos = p-PyUnicode_AS_UNICODE(unicode);
1045 endinpos = s-starts;
1046 if (unicode_decode_call_errorhandler(
1047 errors, &errorHandler,
1048 "utf7", errmsg,
1049 starts, size, &startinpos, &endinpos, &exc, &s,
1050 (PyObject **)&unicode, &outpos, &p))
1051 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 }
1053
1054 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001055 outpos = p-PyUnicode_AS_UNICODE(unicode);
1056 endinpos = size;
1057 if (unicode_decode_call_errorhandler(
1058 errors, &errorHandler,
1059 "utf7", "unterminated shift sequence",
1060 starts, size, &startinpos, &endinpos, &exc, &s,
1061 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001063 if (s < e)
1064 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001065 }
1066
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001067 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 goto onError;
1069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001070 Py_XDECREF(errorHandler);
1071 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001072 return (PyObject *)unicode;
1073
1074onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001075 Py_XDECREF(errorHandler);
1076 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001077 Py_DECREF(unicode);
1078 return NULL;
1079}
1080
1081
1082PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001083 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001084 int encodeSetO,
1085 int encodeWhiteSpace,
1086 const char *errors)
1087{
1088 PyObject *v;
1089 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001091 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 unsigned int bitsleft = 0;
1094 unsigned long charsleft = 0;
1095 char * out;
1096 char * start;
1097
1098 if (size == 0)
1099 return PyString_FromStringAndSize(NULL, 0);
1100
1101 v = PyString_FromStringAndSize(NULL, cbAllocated);
1102 if (v == NULL)
1103 return NULL;
1104
1105 start = out = PyString_AS_STRING(v);
1106 for (;i < size; ++i) {
1107 Py_UNICODE ch = s[i];
1108
1109 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001110 if (ch == '+') {
1111 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001112 *out++ = '-';
1113 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1114 charsleft = ch;
1115 bitsleft = 16;
1116 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001117 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001118 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001119 } else {
1120 *out++ = (char) ch;
1121 }
1122 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001123 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1124 *out++ = B64(charsleft << (6-bitsleft));
1125 charsleft = 0;
1126 bitsleft = 0;
1127 /* Characters not in the BASE64 set implicitly unshift the sequence
1128 so no '-' is required, except if the character is itself a '-' */
1129 if (B64CHAR(ch) || ch == '-') {
1130 *out++ = '-';
1131 }
1132 inShift = 0;
1133 *out++ = (char) ch;
1134 } else {
1135 bitsleft += 16;
1136 charsleft = (charsleft << 16) | ch;
1137 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1138
1139 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001140 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001141 or '-' then the shift sequence will be terminated implicitly and we
1142 don't have to insert a '-'. */
1143
1144 if (bitsleft == 0) {
1145 if (i + 1 < size) {
1146 Py_UNICODE ch2 = s[i+1];
1147
1148 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001149
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 } else if (B64CHAR(ch2) || ch2 == '-') {
1151 *out++ = '-';
1152 inShift = 0;
1153 } else {
1154 inShift = 0;
1155 }
1156
1157 }
1158 else {
1159 *out++ = '-';
1160 inShift = 0;
1161 }
1162 }
Tim Petersced69f82003-09-16 20:30:58 +00001163 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001164 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001165 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001166 if (bitsleft) {
1167 *out++= B64(charsleft << (6-bitsleft) );
1168 *out++ = '-';
1169 }
1170
Tim Peters5de98422002-04-27 18:44:32 +00001171 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001172 return v;
1173}
1174
/* The helper macros above are private to the UTF-7 codec; remove them
   again so that they are not visible in the rest of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1181
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182/* --- UTF-8 Codec -------------------------------------------------------- */
1183
/* Length of the UTF-8 sequence introduced by each possible lead byte.
   Zero marks a byte that is not a legal prefix.  See RFC 2279 for
   details.  The table is indexed by byte value; every row holds 16
   entries. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x0f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1205
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001207 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 const char *errors)
1209{
Walter Dörwald69652032004-09-07 20:24:22 +00001210 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1211}
1212
1213PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001214 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001215 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001217{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001218 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001220 Py_ssize_t startinpos;
1221 Py_ssize_t endinpos;
1222 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 const char *e;
1224 PyUnicodeObject *unicode;
1225 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001226 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 PyObject *errorHandler = NULL;
1228 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229
1230 /* Note: size will always be longer than the resulting Unicode
1231 character count */
1232 unicode = _PyUnicode_New(size);
1233 if (!unicode)
1234 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 if (size == 0) {
1236 if (consumed)
1237 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240
1241 /* Unpack UTF-8 encoded data */
1242 p = unicode->str;
1243 e = s + size;
1244
1245 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001246 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247
1248 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001249 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 s++;
1251 continue;
1252 }
1253
1254 n = utf8_code_length[ch];
1255
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001256 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001257 if (consumed)
1258 break;
1259 else {
1260 errmsg = "unexpected end of data";
1261 startinpos = s-starts;
1262 endinpos = size;
1263 goto utf8Error;
1264 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266
1267 switch (n) {
1268
1269 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001270 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001271 startinpos = s-starts;
1272 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001273 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274
1275 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001276 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001277 startinpos = s-starts;
1278 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280
1281 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 if ((s[1] & 0xc0) != 0x80) {
1283 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 startinpos = s-starts;
1285 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 goto utf8Error;
1287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001289 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001290 startinpos = s-starts;
1291 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 errmsg = "illegal encoding";
1293 goto utf8Error;
1294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001296 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 break;
1298
1299 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001300 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 (s[2] & 0xc0) != 0x80) {
1302 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001303 startinpos = s-starts;
1304 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001305 goto utf8Error;
1306 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001308 if (ch < 0x0800) {
1309 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001310 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001311
1312 XXX For wide builds (UCS-4) we should probably try
1313 to recombine the surrogates into a single code
1314 unit.
1315 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 startinpos = s-starts;
1318 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001319 goto utf8Error;
1320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001322 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 break;
1324
1325 case 4:
1326 if ((s[1] & 0xc0) != 0x80 ||
1327 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 (s[3] & 0xc0) != 0x80) {
1329 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001330 startinpos = s-starts;
1331 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001332 goto utf8Error;
1333 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001334 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1335 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1336 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001338 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001339 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001340 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001341 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 startinpos = s-starts;
1344 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001345 goto utf8Error;
1346 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001347#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001348 *p++ = (Py_UNICODE)ch;
1349#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001350 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001351
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001352 /* translate from 10000..10FFFF to 0..FFFF */
1353 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001354
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001355 /* high surrogate = top 10 bits added to D800 */
1356 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001357
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001358 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001359 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001360#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 break;
1362
1363 default:
1364 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001366 startinpos = s-starts;
1367 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001368 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369 }
1370 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001371 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001372
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001373 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001374 outpos = p-PyUnicode_AS_UNICODE(unicode);
1375 if (unicode_decode_call_errorhandler(
1376 errors, &errorHandler,
1377 "utf8", errmsg,
1378 starts, size, &startinpos, &endinpos, &exc, &s,
1379 (PyObject **)&unicode, &outpos, &p))
1380 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 }
Walter Dörwald69652032004-09-07 20:24:22 +00001382 if (consumed)
1383 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384
1385 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001386 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 goto onError;
1388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001389 Py_XDECREF(errorHandler);
1390 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 return (PyObject *)unicode;
1392
1393onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 Py_XDECREF(errorHandler);
1395 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 Py_DECREF(unicode);
1397 return NULL;
1398}
1399
Tim Peters602f7402002-04-27 18:03:26 +00001400/* Allocation strategy: if the string is short, convert into a stack buffer
1401 and allocate exactly as much space needed at the end. Else allocate the
1402 maximum possible needed (4 result bytes per Unicode character), and return
1403 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001404*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001405PyObject *
1406PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001407 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001408 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409{
Tim Peters602f7402002-04-27 18:03:26 +00001410#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001411
Martin v. Löwis18e16552006-02-15 17:27:45 +00001412 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001413 PyObject *v; /* result string object */
1414 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001416 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001417 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001418
Tim Peters602f7402002-04-27 18:03:26 +00001419 assert(s != NULL);
1420 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421
Tim Peters602f7402002-04-27 18:03:26 +00001422 if (size <= MAX_SHORT_UNICHARS) {
1423 /* Write into the stack buffer; nallocated can't overflow.
1424 * At the end, we'll allocate exactly as much heap space as it
1425 * turns out we need.
1426 */
1427 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1428 v = NULL; /* will allocate after we're done */
1429 p = stackbuf;
1430 }
1431 else {
1432 /* Overallocate on the heap, and give the excess back at the end. */
1433 nallocated = size * 4;
1434 if (nallocated / 4 != size) /* overflow! */
1435 return PyErr_NoMemory();
1436 v = PyString_FromStringAndSize(NULL, nallocated);
1437 if (v == NULL)
1438 return NULL;
1439 p = PyString_AS_STRING(v);
1440 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441
Tim Peters602f7402002-04-27 18:03:26 +00001442 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001443 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001445 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001448
Guido van Rossumd57fd912000-03-10 22:53:23 +00001449 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001450 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001451 *p++ = (char)(0xc0 | (ch >> 6));
1452 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001453 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001454 else {
Tim Peters602f7402002-04-27 18:03:26 +00001455 /* Encode UCS2 Unicode ordinals */
1456 if (ch < 0x10000) {
1457 /* Special case: check for high surrogate */
1458 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1459 Py_UCS4 ch2 = s[i];
1460 /* Check for low surrogate and combine the two to
1461 form a UCS4 value */
1462 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001463 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001464 i++;
1465 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001466 }
Tim Peters602f7402002-04-27 18:03:26 +00001467 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001468 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001469 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001470 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1471 *p++ = (char)(0x80 | (ch & 0x3f));
1472 continue;
1473 }
1474encodeUCS4:
1475 /* Encode UCS4 Unicode ordinals */
1476 *p++ = (char)(0xf0 | (ch >> 18));
1477 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1478 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1479 *p++ = (char)(0x80 | (ch & 0x3f));
1480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001482
Tim Peters602f7402002-04-27 18:03:26 +00001483 if (v == NULL) {
1484 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001485 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001486 assert(nneeded <= nallocated);
1487 v = PyString_FromStringAndSize(stackbuf, nneeded);
1488 }
1489 else {
1490 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001491 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001492 assert(nneeded <= nallocated);
1493 _PyString_Resize(&v, nneeded);
1494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001496
Tim Peters602f7402002-04-27 18:03:26 +00001497#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498}
1499
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1501{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502 if (!PyUnicode_Check(unicode)) {
1503 PyErr_BadArgument();
1504 return NULL;
1505 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001506 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1507 PyUnicode_GET_SIZE(unicode),
1508 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509}
1510
1511/* --- UTF-16 Codec ------------------------------------------------------- */
1512
Tim Peters772747b2001-08-09 22:21:55 +00001513PyObject *
1514PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001515 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001516 const char *errors,
1517 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518{
Walter Dörwald69652032004-09-07 20:24:22 +00001519 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1520}
1521
1522PyObject *
1523PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001524 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001525 const char *errors,
1526 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001527 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001530 Py_ssize_t startinpos;
1531 Py_ssize_t endinpos;
1532 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 PyUnicodeObject *unicode;
1534 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001535 const unsigned char *q, *e;
1536 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001537 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001538 /* Offsets from q for retrieving byte pairs in the right order. */
1539#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1540 int ihi = 1, ilo = 0;
1541#else
1542 int ihi = 0, ilo = 1;
1543#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 PyObject *errorHandler = NULL;
1545 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546
1547 /* Note: size will always be longer than the resulting Unicode
1548 character count */
1549 unicode = _PyUnicode_New(size);
1550 if (!unicode)
1551 return NULL;
1552 if (size == 0)
1553 return (PyObject *)unicode;
1554
1555 /* Unpack UTF-16 encoded data */
1556 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001557 q = (unsigned char *)s;
1558 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559
1560 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001561 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001563 /* Check for BOM marks (U+FEFF) in the input and adjust current
1564 byte order setting accordingly. In native mode, the leading BOM
1565 mark is skipped, in all other modes, it is copied to the output
1566 stream as-is (giving a ZWNBSP character). */
1567 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001568 if (size >= 2) {
1569 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001570#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001571 if (bom == 0xFEFF) {
1572 q += 2;
1573 bo = -1;
1574 }
1575 else if (bom == 0xFFFE) {
1576 q += 2;
1577 bo = 1;
1578 }
Tim Petersced69f82003-09-16 20:30:58 +00001579#else
Walter Dörwald69652032004-09-07 20:24:22 +00001580 if (bom == 0xFEFF) {
1581 q += 2;
1582 bo = 1;
1583 }
1584 else if (bom == 0xFFFE) {
1585 q += 2;
1586 bo = -1;
1587 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001588#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001589 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591
Tim Peters772747b2001-08-09 22:21:55 +00001592 if (bo == -1) {
1593 /* force LE */
1594 ihi = 1;
1595 ilo = 0;
1596 }
1597 else if (bo == 1) {
1598 /* force BE */
1599 ihi = 0;
1600 ilo = 1;
1601 }
1602
1603 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001605 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001607 if (consumed)
1608 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001609 errmsg = "truncated data";
1610 startinpos = ((const char *)q)-starts;
1611 endinpos = ((const char *)e)-starts;
1612 goto utf16Error;
1613 /* The remaining input chars are ignored if the callback
1614 chooses to skip the input */
1615 }
1616 ch = (q[ihi] << 8) | q[ilo];
1617
Tim Peters772747b2001-08-09 22:21:55 +00001618 q += 2;
1619
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 if (ch < 0xD800 || ch > 0xDFFF) {
1621 *p++ = ch;
1622 continue;
1623 }
1624
1625 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 if (q >= e) {
1627 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 startinpos = (((const char *)q)-2)-starts;
1629 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001630 goto utf16Error;
1631 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001632 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001633 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1634 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001635 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001636#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001637 *p++ = ch;
1638 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639#else
1640 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001641#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001642 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001643 }
1644 else {
1645 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 startinpos = (((const char *)q)-4)-starts;
1647 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001648 goto utf16Error;
1649 }
1650
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001652 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001653 startinpos = (((const char *)q)-2)-starts;
1654 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001655 /* Fall through to report the error */
1656
1657 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 outpos = p-PyUnicode_AS_UNICODE(unicode);
1659 if (unicode_decode_call_errorhandler(
1660 errors, &errorHandler,
1661 "utf16", errmsg,
1662 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1663 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001664 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 }
1666
1667 if (byteorder)
1668 *byteorder = bo;
1669
Walter Dörwald69652032004-09-07 20:24:22 +00001670 if (consumed)
1671 *consumed = (const char *)q-starts;
1672
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001674 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 goto onError;
1676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001677 Py_XDECREF(errorHandler);
1678 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 return (PyObject *)unicode;
1680
1681onError:
1682 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001683 Py_XDECREF(errorHandler);
1684 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 return NULL;
1686}
1687
Tim Peters772747b2001-08-09 22:21:55 +00001688PyObject *
1689PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001690 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001691 const char *errors,
1692 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693{
1694 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001695 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001696#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001697 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001698#else
1699 const int pairs = 0;
1700#endif
Tim Peters772747b2001-08-09 22:21:55 +00001701 /* Offsets from p for storing byte pairs in the right order. */
1702#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1703 int ihi = 1, ilo = 0;
1704#else
1705 int ihi = 0, ilo = 1;
1706#endif
1707
1708#define STORECHAR(CH) \
1709 do { \
1710 p[ihi] = ((CH) >> 8) & 0xff; \
1711 p[ilo] = (CH) & 0xff; \
1712 p += 2; \
1713 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001716 for (i = pairs = 0; i < size; i++)
1717 if (s[i] >= 0x10000)
1718 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001719#endif
Tim Petersced69f82003-09-16 20:30:58 +00001720 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001721 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (v == NULL)
1723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
Tim Peters772747b2001-08-09 22:21:55 +00001725 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001727 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001728 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001729 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001730
1731 if (byteorder == -1) {
1732 /* force LE */
1733 ihi = 1;
1734 ilo = 0;
1735 }
1736 else if (byteorder == 1) {
1737 /* force BE */
1738 ihi = 0;
1739 ilo = 1;
1740 }
1741
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 while (size-- > 0) {
1743 Py_UNICODE ch = *s++;
1744 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001745#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001746 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001747 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1748 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001750#endif
Tim Peters772747b2001-08-09 22:21:55 +00001751 STORECHAR(ch);
1752 if (ch2)
1753 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001756#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757}
1758
1759PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1760{
1761 if (!PyUnicode_Check(unicode)) {
1762 PyErr_BadArgument();
1763 return NULL;
1764 }
1765 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1766 PyUnicode_GET_SIZE(unicode),
1767 NULL,
1768 0);
1769}
1770
1771/* --- Unicode Escape Codec ----------------------------------------------- */
1772
Fredrik Lundh06d12682001-01-24 07:59:11 +00001773static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001774
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 const char *errors)
1778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001780 Py_ssize_t startinpos;
1781 Py_ssize_t endinpos;
1782 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787 char* message;
1788 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001789 PyObject *errorHandler = NULL;
1790 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001791
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 /* Escaped strings will always be longer than the resulting
1793 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 length after conversion to the true value.
1795 (but if the error callback returns a long replacement string
1796 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 v = _PyUnicode_New(size);
1798 if (v == NULL)
1799 goto onError;
1800 if (size == 0)
1801 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 while (s < end) {
1807 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001808 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810
1811 /* Non-escape characters are interpreted as Unicode ordinals */
1812 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001813 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 continue;
1815 }
1816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818 /* \ - Escapes */
1819 s++;
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001820 c = *s++;
1821 if (s > end)
1822 c = '\0'; /* Invalid after \ */
1823 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824
1825 /* \x escapes */
1826 case '\n': break;
1827 case '\\': *p++ = '\\'; break;
1828 case '\'': *p++ = '\''; break;
1829 case '\"': *p++ = '\"'; break;
1830 case 'b': *p++ = '\b'; break;
1831 case 'f': *p++ = '\014'; break; /* FF */
1832 case 't': *p++ = '\t'; break;
1833 case 'n': *p++ = '\n'; break;
1834 case 'r': *p++ = '\r'; break;
1835 case 'v': *p++ = '\013'; break; /* VT */
1836 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1837
1838 /* \OOO (octal) escapes */
1839 case '0': case '1': case '2': case '3':
1840 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001841 x = s[-1] - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001842 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001843 x = (x<<3) + *s++ - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001844 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001845 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001847 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 break;
1849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* hex escapes */
1851 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001853 digits = 2;
1854 message = "truncated \\xXX escape";
1855 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856
Fredrik Lundhccc74732001-02-18 22:13:49 +00001857 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001859 digits = 4;
1860 message = "truncated \\uXXXX escape";
1861 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001864 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865 digits = 8;
1866 message = "truncated \\UXXXXXXXX escape";
1867 hexescape:
1868 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 outpos = p-PyUnicode_AS_UNICODE(v);
1870 if (s+digits>end) {
1871 endinpos = size;
1872 if (unicode_decode_call_errorhandler(
1873 errors, &errorHandler,
1874 "unicodeescape", "end of string in escape sequence",
1875 starts, size, &startinpos, &endinpos, &exc, &s,
1876 (PyObject **)&v, &outpos, &p))
1877 goto onError;
1878 goto nextByte;
1879 }
1880 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001881 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 endinpos = (s+i+1)-starts;
1884 if (unicode_decode_call_errorhandler(
1885 errors, &errorHandler,
1886 "unicodeescape", message,
1887 starts, size, &startinpos, &endinpos, &exc, &s,
1888 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001889 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001891 }
1892 chr = (chr<<4) & ~0xF;
1893 if (c >= '0' && c <= '9')
1894 chr += c - '0';
1895 else if (c >= 'a' && c <= 'f')
1896 chr += 10 + c - 'a';
1897 else
1898 chr += 10 + c - 'A';
1899 }
1900 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001901 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 /* _decoding_error will have already written into the
1903 target buffer. */
1904 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001905 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001906 /* when we get here, chr is a 32-bit unicode character */
1907 if (chr <= 0xffff)
1908 /* UCS-2 character */
1909 *p++ = (Py_UNICODE) chr;
1910 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001911 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001912 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001913#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001914 *p++ = chr;
1915#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001916 chr -= 0x10000L;
1917 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001918 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001919#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001920 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001921 endinpos = s-starts;
1922 outpos = p-PyUnicode_AS_UNICODE(v);
1923 if (unicode_decode_call_errorhandler(
1924 errors, &errorHandler,
1925 "unicodeescape", "illegal Unicode character",
1926 starts, size, &startinpos, &endinpos, &exc, &s,
1927 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001928 goto onError;
1929 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001930 break;
1931
1932 /* \N{name} */
1933 case 'N':
1934 message = "malformed \\N character escape";
1935 if (ucnhash_CAPI == NULL) {
1936 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001937 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 m = PyImport_ImportModule("unicodedata");
1939 if (m == NULL)
1940 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001941 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001943 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001944 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001945 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001946 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001947 if (ucnhash_CAPI == NULL)
1948 goto ucnhashError;
1949 }
1950 if (*s == '{') {
1951 const char *start = s+1;
1952 /* look for the closing brace */
1953 while (*s != '}' && s < end)
1954 s++;
1955 if (s > start && s < end && *s == '}') {
1956 /* found a name. look it up in the unicode database */
1957 message = "unknown Unicode character name";
1958 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001959 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001960 goto store;
1961 }
1962 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001963 endinpos = s-starts;
1964 outpos = p-PyUnicode_AS_UNICODE(v);
1965 if (unicode_decode_call_errorhandler(
1966 errors, &errorHandler,
1967 "unicodeescape", message,
1968 starts, size, &startinpos, &endinpos, &exc, &s,
1969 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001970 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001971 break;
1972
1973 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001974 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001975 message = "\\ at end of string";
1976 s--;
1977 endinpos = s-starts;
1978 outpos = p-PyUnicode_AS_UNICODE(v);
1979 if (unicode_decode_call_errorhandler(
1980 errors, &errorHandler,
1981 "unicodeescape", message,
1982 starts, size, &startinpos, &endinpos, &exc, &s,
1983 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001984 goto onError;
1985 }
1986 else {
1987 *p++ = '\\';
1988 *p++ = (unsigned char)s[-1];
1989 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001990 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001992 nextByte:
1993 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001995 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001996 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001997 Py_XDECREF(errorHandler);
1998 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002000
Fredrik Lundhccc74732001-02-18 22:13:49 +00002001ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002002 PyErr_SetString(
2003 PyExc_UnicodeError,
2004 "\\N escapes not supported (can't load unicodedata module)"
2005 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002006 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002007 Py_XDECREF(errorHandler);
2008 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002009 return NULL;
2010
Fredrik Lundhccc74732001-02-18 22:13:49 +00002011onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 Py_XDECREF(errorHandler);
2014 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 return NULL;
2016}
2017
2018/* Return a Unicode-Escape string version of the Unicode object.
2019
2020 If quotes is true, the string is enclosed in u"" or u'' quotes as
2021 appropriate.
2022
2023*/
2024
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002025Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002026 Py_ssize_t size,
2027 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002028{
2029 /* like wcschr, but doesn't stop at NULL characters */
2030
2031 while (size-- > 0) {
2032 if (*s == ch)
2033 return s;
2034 s++;
2035 }
2036
2037 return NULL;
2038}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040static
2041PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002042 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 int quotes)
2044{
2045 PyObject *repr;
2046 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002048 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002050 /* Initial allocation is based on the longest-possible unichr
2051 escape.
2052
2053 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2054 unichr, so in this case it's the longest unichr escape. In
2055 narrow (UTF-16) builds this is five chars per source unichr
2056 since there are two unichrs in the surrogate pair, so in narrow
2057 (UTF-16) builds it's not the longest unichr escape.
2058
2059 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2060 so in the narrow (UTF-16) build case it's the longest unichr
2061 escape.
2062 */
2063
2064 repr = PyString_FromStringAndSize(NULL,
2065 2
2066#ifdef Py_UNICODE_WIDE
2067 + 10*size
2068#else
2069 + 6*size
2070#endif
2071 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 if (repr == NULL)
2073 return NULL;
2074
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002075 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076
2077 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002079 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 !findchar(s, size, '"')) ? '"' : '\'';
2081 }
2082 while (size-- > 0) {
2083 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002084
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002085 /* Escape quotes and backslashes */
2086 if ((quotes &&
2087 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088 *p++ = '\\';
2089 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002090 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002091 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002092
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002093#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002094 /* Map 21-bit characters to '\U00xxxxxx' */
2095 else if (ch >= 0x10000) {
2096 *p++ = '\\';
2097 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002098 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2099 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2100 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2101 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2102 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2103 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2104 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002105 *p++ = hexdigit[ch & 0x0000000F];
2106 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002107 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002108#else
2109 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002110 else if (ch >= 0xD800 && ch < 0xDC00) {
2111 Py_UNICODE ch2;
2112 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002113
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002114 ch2 = *s++;
2115 size--;
2116 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2117 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2118 *p++ = '\\';
2119 *p++ = 'U';
2120 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2121 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2122 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2123 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2124 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2125 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2126 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2127 *p++ = hexdigit[ucs & 0x0000000F];
2128 continue;
2129 }
2130 /* Fall through: isolated surrogates are copied as-is */
2131 s--;
2132 size++;
2133 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002134#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002135
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002137 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 *p++ = '\\';
2139 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002140 *p++ = hexdigit[(ch >> 12) & 0x000F];
2141 *p++ = hexdigit[(ch >> 8) & 0x000F];
2142 *p++ = hexdigit[(ch >> 4) & 0x000F];
2143 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002145
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002146 /* Map special whitespace to '\t', \n', '\r' */
2147 else if (ch == '\t') {
2148 *p++ = '\\';
2149 *p++ = 't';
2150 }
2151 else if (ch == '\n') {
2152 *p++ = '\\';
2153 *p++ = 'n';
2154 }
2155 else if (ch == '\r') {
2156 *p++ = '\\';
2157 *p++ = 'r';
2158 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002159
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002160 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002161 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002163 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002164 *p++ = hexdigit[(ch >> 4) & 0x000F];
2165 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002166 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002167
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 /* Copy everything else as-is */
2169 else
2170 *p++ = (char) ch;
2171 }
2172 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002173 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174
2175 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002176 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 return repr;
2178}
2179
2180PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002181 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182{
2183 return unicodeescape_string(s, size, 0);
2184}
2185
2186PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2187{
2188 if (!PyUnicode_Check(unicode)) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode));
2194}
2195
2196/* --- Raw Unicode Escape Codec ------------------------------------------- */
2197
2198PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002199 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 const char *errors)
2201{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002203 Py_ssize_t startinpos;
2204 Py_ssize_t endinpos;
2205 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 const char *end;
2209 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002210 PyObject *errorHandler = NULL;
2211 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002212
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 /* Escaped strings will always be longer than the resulting
2214 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 length after conversion to the true value. (But decoding error
2216 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 v = _PyUnicode_New(size);
2218 if (v == NULL)
2219 goto onError;
2220 if (size == 0)
2221 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002222 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 end = s + size;
2224 while (s < end) {
2225 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002226 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002228 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229
2230 /* Non-escape characters are interpreted as Unicode ordinals */
2231 if (*s != '\\') {
2232 *p++ = (unsigned char)*s++;
2233 continue;
2234 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236
2237 /* \u-escapes are only interpreted iff the number of leading
2238 backslashes if odd */
2239 bs = s;
2240 for (;s < end;) {
2241 if (*s != '\\')
2242 break;
2243 *p++ = (unsigned char)*s++;
2244 }
2245 if (((s - bs) & 1) == 0 ||
2246 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002247 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 continue;
2249 }
2250 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002251 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 s++;
2253
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002254 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 endinpos = s-starts;
2260 if (unicode_decode_call_errorhandler(
2261 errors, &errorHandler,
2262 "rawunicodeescape", "truncated \\uXXXX",
2263 starts, size, &startinpos, &endinpos, &exc, &s,
2264 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 }
2268 x = (x<<4) & ~0xF;
2269 if (c >= '0' && c <= '9')
2270 x += c - '0';
2271 else if (c >= 'a' && c <= 'f')
2272 x += 10 + c - 'a';
2273 else
2274 x += 10 + c - 'A';
2275 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002276#ifndef Py_UNICODE_WIDE
2277 if (x > 0x10000) {
2278 if (unicode_decode_call_errorhandler(
2279 errors, &errorHandler,
2280 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2281 starts, size, &startinpos, &endinpos, &exc, &s,
2282 (PyObject **)&v, &outpos, &p))
2283 goto onError;
2284 }
2285#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002286 *p++ = x;
2287 nextByte:
2288 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002290 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002291 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 Py_XDECREF(errorHandler);
2293 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002295
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 onError:
2297 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002298 Py_XDECREF(errorHandler);
2299 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 return NULL;
2301}
2302
2303PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002304 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305{
2306 PyObject *repr;
2307 char *p;
2308 char *q;
2309
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002310 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002312#ifdef Py_UNICODE_WIDE
2313 repr = PyString_FromStringAndSize(NULL, 10 * size);
2314#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002316#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002317 if (repr == NULL)
2318 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002319 if (size == 0)
2320 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321
2322 p = q = PyString_AS_STRING(repr);
2323 while (size-- > 0) {
2324 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002325#ifdef Py_UNICODE_WIDE
2326 /* Map 32-bit characters to '\Uxxxxxxxx' */
2327 if (ch >= 0x10000) {
2328 *p++ = '\\';
2329 *p++ = 'U';
2330 *p++ = hexdigit[(ch >> 28) & 0xf];
2331 *p++ = hexdigit[(ch >> 24) & 0xf];
2332 *p++ = hexdigit[(ch >> 20) & 0xf];
2333 *p++ = hexdigit[(ch >> 16) & 0xf];
2334 *p++ = hexdigit[(ch >> 12) & 0xf];
2335 *p++ = hexdigit[(ch >> 8) & 0xf];
2336 *p++ = hexdigit[(ch >> 4) & 0xf];
2337 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002338 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002339 else
2340#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 /* Map 16-bit characters to '\uxxxx' */
2342 if (ch >= 256) {
2343 *p++ = '\\';
2344 *p++ = 'u';
2345 *p++ = hexdigit[(ch >> 12) & 0xf];
2346 *p++ = hexdigit[(ch >> 8) & 0xf];
2347 *p++ = hexdigit[(ch >> 4) & 0xf];
2348 *p++ = hexdigit[ch & 15];
2349 }
2350 /* Copy everything else as-is */
2351 else
2352 *p++ = (char) ch;
2353 }
2354 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002355 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 return repr;
2357}
2358
2359PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2360{
2361 if (!PyUnicode_Check(unicode)) {
2362 PyErr_BadArgument();
2363 return NULL;
2364 }
2365 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2366 PyUnicode_GET_SIZE(unicode));
2367}
2368
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002369/* --- Unicode Internal Codec ------------------------------------------- */
2370
2371PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002372 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002373 const char *errors)
2374{
2375 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002376 Py_ssize_t startinpos;
2377 Py_ssize_t endinpos;
2378 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002379 PyUnicodeObject *v;
2380 Py_UNICODE *p;
2381 const char *end;
2382 const char *reason;
2383 PyObject *errorHandler = NULL;
2384 PyObject *exc = NULL;
2385
Neal Norwitzd43069c2006-01-08 01:12:10 +00002386#ifdef Py_UNICODE_WIDE
2387 Py_UNICODE unimax = PyUnicode_GetMax();
2388#endif
2389
Armin Rigo4b63c212006-10-04 11:44:06 +00002390 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002391 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2392 if (v == NULL)
2393 goto onError;
2394 if (PyUnicode_GetSize((PyObject *)v) == 0)
2395 return (PyObject *)v;
2396 p = PyUnicode_AS_UNICODE(v);
2397 end = s + size;
2398
2399 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002400 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002401 /* We have to sanity check the raw data, otherwise doom looms for
2402 some malformed UCS-4 data. */
2403 if (
2404 #ifdef Py_UNICODE_WIDE
2405 *p > unimax || *p < 0 ||
2406 #endif
2407 end-s < Py_UNICODE_SIZE
2408 )
2409 {
2410 startinpos = s - starts;
2411 if (end-s < Py_UNICODE_SIZE) {
2412 endinpos = end-starts;
2413 reason = "truncated input";
2414 }
2415 else {
2416 endinpos = s - starts + Py_UNICODE_SIZE;
2417 reason = "illegal code point (> 0x10FFFF)";
2418 }
2419 outpos = p - PyUnicode_AS_UNICODE(v);
2420 if (unicode_decode_call_errorhandler(
2421 errors, &errorHandler,
2422 "unicode_internal", reason,
2423 starts, size, &startinpos, &endinpos, &exc, &s,
2424 (PyObject **)&v, &outpos, &p)) {
2425 goto onError;
2426 }
2427 }
2428 else {
2429 p++;
2430 s += Py_UNICODE_SIZE;
2431 }
2432 }
2433
Martin v. Löwis412fb672006-04-13 06:34:32 +00002434 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002435 goto onError;
2436 Py_XDECREF(errorHandler);
2437 Py_XDECREF(exc);
2438 return (PyObject *)v;
2439
2440 onError:
2441 Py_XDECREF(v);
2442 Py_XDECREF(errorHandler);
2443 Py_XDECREF(exc);
2444 return NULL;
2445}
2446
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447/* --- Latin-1 Codec ------------------------------------------------------ */
2448
2449PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002450 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451 const char *errors)
2452{
2453 PyUnicodeObject *v;
2454 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002457 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002458 Py_UNICODE r = *(unsigned char*)s;
2459 return PyUnicode_FromUnicode(&r, 1);
2460 }
2461
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 v = _PyUnicode_New(size);
2463 if (v == NULL)
2464 goto onError;
2465 if (size == 0)
2466 return (PyObject *)v;
2467 p = PyUnicode_AS_UNICODE(v);
2468 while (size-- > 0)
2469 *p++ = (unsigned char)*s++;
2470 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002471
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 onError:
2473 Py_XDECREF(v);
2474 return NULL;
2475}
2476
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477/* create or adjust a UnicodeEncodeError */
2478static void make_encode_exception(PyObject **exceptionObject,
2479 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002480 const Py_UNICODE *unicode, Py_ssize_t size,
2481 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 if (*exceptionObject == NULL) {
2485 *exceptionObject = PyUnicodeEncodeError_Create(
2486 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 }
2488 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002489 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2490 goto onError;
2491 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2492 goto onError;
2493 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2494 goto onError;
2495 return;
2496 onError:
2497 Py_DECREF(*exceptionObject);
2498 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 }
2500}
2501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502/* raises a UnicodeEncodeError */
2503static void raise_encode_exception(PyObject **exceptionObject,
2504 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002505 const Py_UNICODE *unicode, Py_ssize_t size,
2506 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 const char *reason)
2508{
2509 make_encode_exception(exceptionObject,
2510 encoding, unicode, size, startpos, endpos, reason);
2511 if (*exceptionObject != NULL)
2512 PyCodec_StrictErrors(*exceptionObject);
2513}
2514
2515/* error handling callback helper:
2516 build arguments, call the callback and check the arguments,
2517 put the result into newpos and return the replacement string, which
2518 has to be freed by the caller */
2519static PyObject *unicode_encode_call_errorhandler(const char *errors,
2520 PyObject **errorHandler,
2521 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002522 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2523 Py_ssize_t startpos, Py_ssize_t endpos,
2524 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002525{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002526 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002527
2528 PyObject *restuple;
2529 PyObject *resunicode;
2530
2531 if (*errorHandler == NULL) {
2532 *errorHandler = PyCodec_LookupError(errors);
2533 if (*errorHandler == NULL)
2534 return NULL;
2535 }
2536
2537 make_encode_exception(exceptionObject,
2538 encoding, unicode, size, startpos, endpos, reason);
2539 if (*exceptionObject == NULL)
2540 return NULL;
2541
2542 restuple = PyObject_CallFunctionObjArgs(
2543 *errorHandler, *exceptionObject, NULL);
2544 if (restuple == NULL)
2545 return NULL;
2546 if (!PyTuple_Check(restuple)) {
2547 PyErr_Format(PyExc_TypeError, &argparse[4]);
2548 Py_DECREF(restuple);
2549 return NULL;
2550 }
2551 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2552 &resunicode, newpos)) {
2553 Py_DECREF(restuple);
2554 return NULL;
2555 }
2556 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002557 *newpos = size+*newpos;
2558 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002559 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002560 Py_DECREF(restuple);
2561 return NULL;
2562 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 Py_INCREF(resunicode);
2564 Py_DECREF(restuple);
2565 return resunicode;
2566}
2567
2568static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002569 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 const char *errors,
2571 int limit)
2572{
2573 /* output object */
2574 PyObject *res;
2575 /* pointers to the beginning and end+1 of input */
2576 const Py_UNICODE *startp = p;
2577 const Py_UNICODE *endp = p + size;
2578 /* pointer to the beginning of the unencodable characters */
2579 /* const Py_UNICODE *badp = NULL; */
2580 /* pointer into the output */
2581 char *str;
2582 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002583 Py_ssize_t respos = 0;
2584 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002585 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2586 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587 PyObject *errorHandler = NULL;
2588 PyObject *exc = NULL;
2589 /* the following variable is used for caching string comparisons
2590 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2591 int known_errorHandler = -1;
2592
2593 /* allocate enough for a simple encoding without
2594 replacements, if we need more, we'll resize */
2595 res = PyString_FromStringAndSize(NULL, size);
2596 if (res == NULL)
2597 goto onError;
2598 if (size == 0)
2599 return res;
2600 str = PyString_AS_STRING(res);
2601 ressize = size;
2602
2603 while (p<endp) {
2604 Py_UNICODE c = *p;
2605
2606 /* can we encode this? */
2607 if (c<limit) {
2608 /* no overflow check, because we know that the space is enough */
2609 *str++ = (char)c;
2610 ++p;
2611 }
2612 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002613 Py_ssize_t unicodepos = p-startp;
2614 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002616 Py_ssize_t repsize;
2617 Py_ssize_t newpos;
2618 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 Py_UNICODE *uni2;
2620 /* startpos for collecting unencodable chars */
2621 const Py_UNICODE *collstart = p;
2622 const Py_UNICODE *collend = p;
2623 /* find all unecodable characters */
2624 while ((collend < endp) && ((*collend)>=limit))
2625 ++collend;
2626 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2627 if (known_errorHandler==-1) {
2628 if ((errors==NULL) || (!strcmp(errors, "strict")))
2629 known_errorHandler = 1;
2630 else if (!strcmp(errors, "replace"))
2631 known_errorHandler = 2;
2632 else if (!strcmp(errors, "ignore"))
2633 known_errorHandler = 3;
2634 else if (!strcmp(errors, "xmlcharrefreplace"))
2635 known_errorHandler = 4;
2636 else
2637 known_errorHandler = 0;
2638 }
2639 switch (known_errorHandler) {
2640 case 1: /* strict */
2641 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2642 goto onError;
2643 case 2: /* replace */
2644 while (collstart++<collend)
2645 *str++ = '?'; /* fall through */
2646 case 3: /* ignore */
2647 p = collend;
2648 break;
2649 case 4: /* xmlcharrefreplace */
2650 respos = str-PyString_AS_STRING(res);
2651 /* determine replacement size (temporarily (mis)uses p) */
2652 for (p = collstart, repsize = 0; p < collend; ++p) {
2653 if (*p<10)
2654 repsize += 2+1+1;
2655 else if (*p<100)
2656 repsize += 2+2+1;
2657 else if (*p<1000)
2658 repsize += 2+3+1;
2659 else if (*p<10000)
2660 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002661#ifndef Py_UNICODE_WIDE
2662 else
2663 repsize += 2+5+1;
2664#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002665 else if (*p<100000)
2666 repsize += 2+5+1;
2667 else if (*p<1000000)
2668 repsize += 2+6+1;
2669 else
2670 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002671#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 }
2673 requiredsize = respos+repsize+(endp-collend);
2674 if (requiredsize > ressize) {
2675 if (requiredsize<2*ressize)
2676 requiredsize = 2*ressize;
2677 if (_PyString_Resize(&res, requiredsize))
2678 goto onError;
2679 str = PyString_AS_STRING(res) + respos;
2680 ressize = requiredsize;
2681 }
2682 /* generate replacement (temporarily (mis)uses p) */
2683 for (p = collstart; p < collend; ++p) {
2684 str += sprintf(str, "&#%d;", (int)*p);
2685 }
2686 p = collend;
2687 break;
2688 default:
2689 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2690 encoding, reason, startp, size, &exc,
2691 collstart-startp, collend-startp, &newpos);
2692 if (repunicode == NULL)
2693 goto onError;
2694 /* need more space? (at least enough for what we
2695 have+the replacement+the rest of the string, so
2696 we won't have to check space for encodable characters) */
2697 respos = str-PyString_AS_STRING(res);
2698 repsize = PyUnicode_GET_SIZE(repunicode);
2699 requiredsize = respos+repsize+(endp-collend);
2700 if (requiredsize > ressize) {
2701 if (requiredsize<2*ressize)
2702 requiredsize = 2*ressize;
2703 if (_PyString_Resize(&res, requiredsize)) {
2704 Py_DECREF(repunicode);
2705 goto onError;
2706 }
2707 str = PyString_AS_STRING(res) + respos;
2708 ressize = requiredsize;
2709 }
2710 /* check if there is anything unencodable in the replacement
2711 and copy it to the output */
2712 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2713 c = *uni2;
2714 if (c >= limit) {
2715 raise_encode_exception(&exc, encoding, startp, size,
2716 unicodepos, unicodepos+1, reason);
2717 Py_DECREF(repunicode);
2718 goto onError;
2719 }
2720 *str = (char)c;
2721 }
2722 p = startp + newpos;
2723 Py_DECREF(repunicode);
2724 }
2725 }
2726 }
2727 /* Resize if we allocated to much */
2728 respos = str-PyString_AS_STRING(res);
2729 if (respos<ressize)
2730 /* If this falls res will be NULL */
2731 _PyString_Resize(&res, respos);
2732 Py_XDECREF(errorHandler);
2733 Py_XDECREF(exc);
2734 return res;
2735
2736 onError:
2737 Py_XDECREF(res);
2738 Py_XDECREF(errorHandler);
2739 Py_XDECREF(exc);
2740 return NULL;
2741}
2742
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002744 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 const char *errors)
2746{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748}
2749
2750PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2751{
2752 if (!PyUnicode_Check(unicode)) {
2753 PyErr_BadArgument();
2754 return NULL;
2755 }
2756 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2757 PyUnicode_GET_SIZE(unicode),
2758 NULL);
2759}
2760
2761/* --- 7-bit ASCII Codec -------------------------------------------------- */
2762
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002764 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 const char *errors)
2766{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 PyUnicodeObject *v;
2769 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002770 Py_ssize_t startinpos;
2771 Py_ssize_t endinpos;
2772 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 const char *e;
2774 PyObject *errorHandler = NULL;
2775 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002776
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002778 if (size == 1 && *(unsigned char*)s < 128) {
2779 Py_UNICODE r = *(unsigned char*)s;
2780 return PyUnicode_FromUnicode(&r, 1);
2781 }
Tim Petersced69f82003-09-16 20:30:58 +00002782
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 v = _PyUnicode_New(size);
2784 if (v == NULL)
2785 goto onError;
2786 if (size == 0)
2787 return (PyObject *)v;
2788 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002789 e = s + size;
2790 while (s < e) {
2791 register unsigned char c = (unsigned char)*s;
2792 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002794 ++s;
2795 }
2796 else {
2797 startinpos = s-starts;
2798 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002799 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 if (unicode_decode_call_errorhandler(
2801 errors, &errorHandler,
2802 "ascii", "ordinal not in range(128)",
2803 starts, size, &startinpos, &endinpos, &exc, &s,
2804 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002808 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002809 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002810 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 Py_XDECREF(errorHandler);
2812 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002814
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 onError:
2816 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 Py_XDECREF(errorHandler);
2818 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 return NULL;
2820}
2821
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002823 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 const char *errors)
2825{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827}
2828
2829PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2830{
2831 if (!PyUnicode_Check(unicode)) {
2832 PyErr_BadArgument();
2833 return NULL;
2834 }
2835 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2836 PyUnicode_GET_SIZE(unicode),
2837 NULL);
2838}
2839
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002840#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002841
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002842/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002843
Martin v. Löwisd8251432006-06-14 05:21:04 +00002844#if SIZEOF_INT < SIZEOF_SSIZE_T
2845#define NEED_RETRY
2846#endif
2847
2848/* XXX This code is limited to "true" double-byte encodings, as
2849 a) it assumes an incomplete character consists of a single byte, and
2850 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2851 encodings, see IsDBCSLeadByteEx documentation. */
2852
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts counts
   when CharPrev() cannot step back from it, when the previous
   character does not start with a lead byte, or when the previous
   character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
2863
2864/*
2865 * Decode MBCS string into unicode object. If 'final' is set, converts
2866 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2867 */
2868static int decode_mbcs(PyUnicodeObject **v,
2869 const char *s, /* MBCS string */
2870 int size, /* sizeof MBCS string */
2871 int final)
2872{
2873 Py_UNICODE *p;
2874 Py_ssize_t n = 0;
2875 int usize = 0;
2876
2877 assert(size >= 0);
2878
2879 /* Skip trailing lead-byte unless 'final' is set */
2880 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2881 --size;
2882
2883 /* First get the size of the result */
2884 if (size > 0) {
2885 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2886 if (usize == 0) {
2887 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2888 return -1;
2889 }
2890 }
2891
2892 if (*v == NULL) {
2893 /* Create unicode object */
2894 *v = _PyUnicode_New(usize);
2895 if (*v == NULL)
2896 return -1;
2897 }
2898 else {
2899 /* Extend unicode object */
2900 n = PyUnicode_GET_SIZE(*v);
2901 if (_PyUnicode_Resize(v, n + usize) < 0)
2902 return -1;
2903 }
2904
2905 /* Do the conversion */
2906 if (size > 0) {
2907 p = PyUnicode_AS_UNICODE(*v) + n;
2908 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2909 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2910 return -1;
2911 }
2912 }
2913
2914 return size;
2915}
2916
2917PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2918 Py_ssize_t size,
2919 const char *errors,
2920 Py_ssize_t *consumed)
2921{
2922 PyUnicodeObject *v = NULL;
2923 int done;
2924
2925 if (consumed)
2926 *consumed = 0;
2927
2928#ifdef NEED_RETRY
2929 retry:
2930 if (size > INT_MAX)
2931 done = decode_mbcs(&v, s, INT_MAX, 0);
2932 else
2933#endif
2934 done = decode_mbcs(&v, s, (int)size, !consumed);
2935
2936 if (done < 0) {
2937 Py_XDECREF(v);
2938 return NULL;
2939 }
2940
2941 if (consumed)
2942 *consumed += done;
2943
2944#ifdef NEED_RETRY
2945 if (size > INT_MAX) {
2946 s += done;
2947 size -= done;
2948 goto retry;
2949 }
2950#endif
2951
2952 return (PyObject *)v;
2953}
2954
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002955PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002956 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002957 const char *errors)
2958{
Martin v. Löwisd8251432006-06-14 05:21:04 +00002959 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2960}
2961
2962/*
2963 * Convert unicode into string object (MBCS).
2964 * Returns 0 if succeed, -1 otherwise.
2965 */
2966static int encode_mbcs(PyObject **repr,
2967 const Py_UNICODE *p, /* unicode */
2968 int size) /* size of unicode */
2969{
2970 int mbcssize = 0;
2971 Py_ssize_t n = 0;
2972
2973 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002974
2975 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00002976 if (size > 0) {
2977 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2978 if (mbcssize == 0) {
2979 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2980 return -1;
2981 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002982 }
2983
Martin v. Löwisd8251432006-06-14 05:21:04 +00002984 if (*repr == NULL) {
2985 /* Create string object */
2986 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2987 if (*repr == NULL)
2988 return -1;
2989 }
2990 else {
2991 /* Extend string object */
2992 n = PyString_Size(*repr);
2993 if (_PyString_Resize(repr, n + mbcssize) < 0)
2994 return -1;
2995 }
2996
2997 /* Do the conversion */
2998 if (size > 0) {
2999 char *s = PyString_AS_STRING(*repr) + n;
3000 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3001 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3002 return -1;
3003 }
3004 }
3005
3006 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003007}
3008
3009PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003011 const char *errors)
3012{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003013 PyObject *repr = NULL;
3014 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003015
Martin v. Löwisd8251432006-06-14 05:21:04 +00003016#ifdef NEED_RETRY
3017 retry:
3018 if (size > INT_MAX)
3019 ret = encode_mbcs(&repr, p, INT_MAX);
3020 else
3021#endif
3022 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003023
Martin v. Löwisd8251432006-06-14 05:21:04 +00003024 if (ret < 0) {
3025 Py_XDECREF(repr);
3026 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003027 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003028
3029#ifdef NEED_RETRY
3030 if (size > INT_MAX) {
3031 p += INT_MAX;
3032 size -= INT_MAX;
3033 goto retry;
3034 }
3035#endif
3036
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003037 return repr;
3038}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003039
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003040PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3041{
3042 if (!PyUnicode_Check(unicode)) {
3043 PyErr_BadArgument();
3044 return NULL;
3045 }
3046 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3047 PyUnicode_GET_SIZE(unicode),
3048 NULL);
3049}
3050
Martin v. Löwisd8251432006-06-14 05:21:04 +00003051#undef NEED_RETRY
3052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003053#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003054
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055/* --- Character Mapping Codec -------------------------------------------- */
3056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003058 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 PyObject *mapping,
3060 const char *errors)
3061{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003063 Py_ssize_t startinpos;
3064 Py_ssize_t endinpos;
3065 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 PyUnicodeObject *v;
3068 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003069 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 PyObject *errorHandler = NULL;
3071 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003072 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003073 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003074
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 /* Default to Latin-1 */
3076 if (mapping == NULL)
3077 return PyUnicode_DecodeLatin1(s, size, errors);
3078
3079 v = _PyUnicode_New(size);
3080 if (v == NULL)
3081 goto onError;
3082 if (size == 0)
3083 return (PyObject *)v;
3084 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003086 if (PyUnicode_CheckExact(mapping)) {
3087 mapstring = PyUnicode_AS_UNICODE(mapping);
3088 maplen = PyUnicode_GET_SIZE(mapping);
3089 while (s < e) {
3090 unsigned char ch = *s;
3091 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003093 if (ch < maplen)
3094 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003096 if (x == 0xfffe) {
3097 /* undefined mapping */
3098 outpos = p-PyUnicode_AS_UNICODE(v);
3099 startinpos = s-starts;
3100 endinpos = startinpos+1;
3101 if (unicode_decode_call_errorhandler(
3102 errors, &errorHandler,
3103 "charmap", "character maps to <undefined>",
3104 starts, size, &startinpos, &endinpos, &exc, &s,
3105 (PyObject **)&v, &outpos, &p)) {
3106 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003107 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003108 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003109 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003110 *p++ = x;
3111 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003113 }
3114 else {
3115 while (s < e) {
3116 unsigned char ch = *s;
3117 PyObject *w, *x;
3118
3119 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3120 w = PyInt_FromLong((long)ch);
3121 if (w == NULL)
3122 goto onError;
3123 x = PyObject_GetItem(mapping, w);
3124 Py_DECREF(w);
3125 if (x == NULL) {
3126 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3127 /* No mapping found means: mapping is undefined. */
3128 PyErr_Clear();
3129 x = Py_None;
3130 Py_INCREF(x);
3131 } else
3132 goto onError;
3133 }
3134
3135 /* Apply mapping */
3136 if (PyInt_Check(x)) {
3137 long value = PyInt_AS_LONG(x);
3138 if (value < 0 || value > 65535) {
3139 PyErr_SetString(PyExc_TypeError,
3140 "character mapping must be in range(65536)");
3141 Py_DECREF(x);
3142 goto onError;
3143 }
3144 *p++ = (Py_UNICODE)value;
3145 }
3146 else if (x == Py_None) {
3147 /* undefined mapping */
3148 outpos = p-PyUnicode_AS_UNICODE(v);
3149 startinpos = s-starts;
3150 endinpos = startinpos+1;
3151 if (unicode_decode_call_errorhandler(
3152 errors, &errorHandler,
3153 "charmap", "character maps to <undefined>",
3154 starts, size, &startinpos, &endinpos, &exc, &s,
3155 (PyObject **)&v, &outpos, &p)) {
3156 Py_DECREF(x);
3157 goto onError;
3158 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003159 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003160 continue;
3161 }
3162 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003163 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003164
3165 if (targetsize == 1)
3166 /* 1-1 mapping */
3167 *p++ = *PyUnicode_AS_UNICODE(x);
3168
3169 else if (targetsize > 1) {
3170 /* 1-n mapping */
3171 if (targetsize > extrachars) {
3172 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003173 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3174 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003175 (targetsize << 2);
3176 extrachars += needed;
Armin Rigo4b63c212006-10-04 11:44:06 +00003177 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003178 if (_PyUnicode_Resize(&v,
3179 PyUnicode_GET_SIZE(v) + needed) < 0) {
3180 Py_DECREF(x);
3181 goto onError;
3182 }
3183 p = PyUnicode_AS_UNICODE(v) + oldpos;
3184 }
3185 Py_UNICODE_COPY(p,
3186 PyUnicode_AS_UNICODE(x),
3187 targetsize);
3188 p += targetsize;
3189 extrachars -= targetsize;
3190 }
3191 /* 1-0 mapping: skip the character */
3192 }
3193 else {
3194 /* wrong return value */
3195 PyErr_SetString(PyExc_TypeError,
3196 "character mapping must return integer, None or unicode");
3197 Py_DECREF(x);
3198 goto onError;
3199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003201 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 }
3204 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003205 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003207 Py_XDECREF(errorHandler);
3208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003210
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003212 Py_XDECREF(errorHandler);
3213 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 Py_XDECREF(v);
3215 return NULL;
3216}
3217
Martin v. Löwis3f767792006-06-04 19:36:28 +00003218/* Charmap encoding: the lookup table */
3219
3220struct encoding_map{
3221 PyObject_HEAD
3222 unsigned char level1[32];
3223 int count2, count3;
3224 unsigned char level23[1];
3225};
3226
3227static PyObject*
3228encoding_map_size(PyObject *obj, PyObject* args)
3229{
3230 struct encoding_map *map = (struct encoding_map*)obj;
3231 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3232 128*map->count3);
3233}
3234
3235static PyMethodDef encoding_map_methods[] = {
3236 {"size", encoding_map_size, METH_NOARGS,
3237 PyDoc_STR("Return the size (in bytes) of this object") },
3238 { 0 }
3239};
3240
3241static void
3242encoding_map_dealloc(PyObject* o)
3243{
3244 PyObject_FREE(o);
3245}
3246
3247static PyTypeObject EncodingMapType = {
3248 PyObject_HEAD_INIT(NULL)
3249 0, /*ob_size*/
3250 "EncodingMap", /*tp_name*/
3251 sizeof(struct encoding_map), /*tp_basicsize*/
3252 0, /*tp_itemsize*/
3253 /* methods */
3254 encoding_map_dealloc, /*tp_dealloc*/
3255 0, /*tp_print*/
3256 0, /*tp_getattr*/
3257 0, /*tp_setattr*/
3258 0, /*tp_compare*/
3259 0, /*tp_repr*/
3260 0, /*tp_as_number*/
3261 0, /*tp_as_sequence*/
3262 0, /*tp_as_mapping*/
3263 0, /*tp_hash*/
3264 0, /*tp_call*/
3265 0, /*tp_str*/
3266 0, /*tp_getattro*/
3267 0, /*tp_setattro*/
3268 0, /*tp_as_buffer*/
3269 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3270 0, /*tp_doc*/
3271 0, /*tp_traverse*/
3272 0, /*tp_clear*/
3273 0, /*tp_richcompare*/
3274 0, /*tp_weaklistoffset*/
3275 0, /*tp_iter*/
3276 0, /*tp_iternext*/
3277 encoding_map_methods, /*tp_methods*/
3278 0, /*tp_members*/
3279 0, /*tp_getset*/
3280 0, /*tp_base*/
3281 0, /*tp_dict*/
3282 0, /*tp_descr_get*/
3283 0, /*tp_descr_set*/
3284 0, /*tp_dictoffset*/
3285 0, /*tp_init*/
3286 0, /*tp_alloc*/
3287 0, /*tp_new*/
3288 0, /*tp_free*/
3289 0, /*tp_is_gc*/
3290};
3291
3292PyObject*
3293PyUnicode_BuildEncodingMap(PyObject* string)
3294{
3295 Py_UNICODE *decode;
3296 PyObject *result;
3297 struct encoding_map *mresult;
3298 int i;
3299 int need_dict = 0;
3300 unsigned char level1[32];
3301 unsigned char level2[512];
3302 unsigned char *mlevel1, *mlevel2, *mlevel3;
3303 int count2 = 0, count3 = 0;
3304
3305 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3306 PyErr_BadArgument();
3307 return NULL;
3308 }
3309 decode = PyUnicode_AS_UNICODE(string);
3310 memset(level1, 0xFF, sizeof level1);
3311 memset(level2, 0xFF, sizeof level2);
3312
3313 /* If there isn't a one-to-one mapping of NULL to \0,
3314 or if there are non-BMP characters, we need to use
3315 a mapping dictionary. */
3316 if (decode[0] != 0)
3317 need_dict = 1;
3318 for (i = 1; i < 256; i++) {
3319 int l1, l2;
3320 if (decode[i] == 0
3321 #ifdef Py_UNICODE_WIDE
3322 || decode[i] > 0xFFFF
3323 #endif
3324 ) {
3325 need_dict = 1;
3326 break;
3327 }
3328 if (decode[i] == 0xFFFE)
3329 /* unmapped character */
3330 continue;
3331 l1 = decode[i] >> 11;
3332 l2 = decode[i] >> 7;
3333 if (level1[l1] == 0xFF)
3334 level1[l1] = count2++;
3335 if (level2[l2] == 0xFF)
3336 level2[l2] = count3++;
3337 }
3338
3339 if (count2 >= 0xFF || count3 >= 0xFF)
3340 need_dict = 1;
3341
3342 if (need_dict) {
3343 PyObject *result = PyDict_New();
3344 PyObject *key, *value;
3345 if (!result)
3346 return NULL;
3347 for (i = 0; i < 256; i++) {
3348 key = value = NULL;
3349 key = PyInt_FromLong(decode[i]);
3350 value = PyInt_FromLong(i);
3351 if (!key || !value)
3352 goto failed1;
3353 if (PyDict_SetItem(result, key, value) == -1)
3354 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003355 Py_DECREF(key);
3356 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003357 }
3358 return result;
3359 failed1:
3360 Py_XDECREF(key);
3361 Py_XDECREF(value);
3362 Py_DECREF(result);
3363 return NULL;
3364 }
3365
3366 /* Create a three-level trie */
3367 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3368 16*count2 + 128*count3 - 1);
3369 if (!result)
3370 return PyErr_NoMemory();
3371 PyObject_Init(result, &EncodingMapType);
3372 mresult = (struct encoding_map*)result;
3373 mresult->count2 = count2;
3374 mresult->count3 = count3;
3375 mlevel1 = mresult->level1;
3376 mlevel2 = mresult->level23;
3377 mlevel3 = mresult->level23 + 16*count2;
3378 memcpy(mlevel1, level1, 32);
3379 memset(mlevel2, 0xFF, 16*count2);
3380 memset(mlevel3, 0, 128*count3);
3381 count3 = 0;
3382 for (i = 1; i < 256; i++) {
3383 int o1, o2, o3, i2, i3;
3384 if (decode[i] == 0xFFFE)
3385 /* unmapped character */
3386 continue;
3387 o1 = decode[i]>>11;
3388 o2 = (decode[i]>>7) & 0xF;
3389 i2 = 16*mlevel1[o1] + o2;
3390 if (mlevel2[i2] == 0xFF)
3391 mlevel2[i2] = count3++;
3392 o3 = decode[i] & 0x7F;
3393 i3 = 128*mlevel2[i2] + o3;
3394 mlevel3[i3] = i;
3395 }
3396 return result;
3397}
3398
3399static int
3400encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3401{
3402 struct encoding_map *map = (struct encoding_map*)mapping;
3403 int l1 = c>>11;
3404 int l2 = (c>>7) & 0xF;
3405 int l3 = c & 0x7F;
3406 int i;
3407
3408#ifdef Py_UNICODE_WIDE
3409 if (c > 0xFFFF) {
3410 return -1;
3411 }
3412#endif
3413 if (c == 0)
3414 return 0;
3415 /* level 1*/
3416 i = map->level1[l1];
3417 if (i == 0xFF) {
3418 return -1;
3419 }
3420 /* level 2*/
3421 i = map->level23[16*i+l2];
3422 if (i == 0xFF) {
3423 return -1;
3424 }
3425 /* level 3 */
3426 i = map->level23[16*map->count2 + 128*i + l3];
3427 if (i == 0) {
3428 return -1;
3429 }
3430 return i;
3431}
3432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433/* Lookup the character ch in the mapping. If the character
3434 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003435 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 PyObject *w = PyInt_FromLong((long)c);
3439 PyObject *x;
3440
3441 if (w == NULL)
3442 return NULL;
3443 x = PyObject_GetItem(mapping, w);
3444 Py_DECREF(w);
3445 if (x == NULL) {
3446 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3447 /* No mapping found means: mapping is undefined. */
3448 PyErr_Clear();
3449 x = Py_None;
3450 Py_INCREF(x);
3451 return x;
3452 } else
3453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003455 else if (x == Py_None)
3456 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 else if (PyInt_Check(x)) {
3458 long value = PyInt_AS_LONG(x);
3459 if (value < 0 || value > 255) {
3460 PyErr_SetString(PyExc_TypeError,
3461 "character mapping must be in range(256)");
3462 Py_DECREF(x);
3463 return NULL;
3464 }
3465 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 else if (PyString_Check(x))
3468 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 /* wrong return value */
3471 PyErr_SetString(PyExc_TypeError,
3472 "character mapping must return integer, None or str");
3473 Py_DECREF(x);
3474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 }
3476}
3477
Martin v. Löwis3f767792006-06-04 19:36:28 +00003478static int
3479charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3480{
3481 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3482 /* exponentially overallocate to minimize reallocations */
3483 if (requiredsize < 2*outsize)
3484 requiredsize = 2*outsize;
3485 if (_PyString_Resize(outobj, requiredsize)) {
3486 return 0;
3487 }
3488 return 1;
3489}
3490
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494/* lookup the character, put the result in the output string and adjust
3495 various state variables. Reallocate the output string if not enough
3496 space is available. Return a new reference to the object that
3497 was put in the output buffer, or Py_None, if the mapping was undefined
3498 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003499 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003501charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003502 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003504 PyObject *rep;
3505 char *outstart;
3506 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507
Martin v. Löwis3f767792006-06-04 19:36:28 +00003508 if (mapping->ob_type == &EncodingMapType) {
3509 int res = encoding_map_lookup(c, mapping);
3510 Py_ssize_t requiredsize = *outpos+1;
3511 if (res == -1)
3512 return enc_FAILED;
3513 if (outsize<requiredsize)
3514 if (!charmapencode_resize(outobj, outpos, requiredsize))
3515 return enc_EXCEPTION;
3516 outstart = PyString_AS_STRING(*outobj);
3517 outstart[(*outpos)++] = (char)res;
3518 return enc_SUCCESS;
3519 }
3520
3521 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003523 return enc_EXCEPTION;
3524 else if (rep==Py_None) {
3525 Py_DECREF(rep);
3526 return enc_FAILED;
3527 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003529 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003530 if (outsize<requiredsize)
3531 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003533 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003535 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3537 }
3538 else {
3539 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003540 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3541 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003542 if (outsize<requiredsize)
3543 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003545 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003547 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 memcpy(outstart + *outpos, repchars, repsize);
3549 *outpos += repsize;
3550 }
3551 }
Georg Brandl9f167602006-06-04 21:46:16 +00003552 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003553 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554}
3555
3556/* handle an error in PyUnicode_EncodeCharmap
3557 Return 0 on success, -1 on error */
3558static
3559int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003560 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003562 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003563 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564{
3565 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003566 Py_ssize_t repsize;
3567 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 Py_UNICODE *uni2;
3569 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003570 Py_ssize_t collstartpos = *inpos;
3571 Py_ssize_t collendpos = *inpos+1;
3572 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 char *encoding = "charmap";
3574 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003575 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 /* find all unencodable characters */
3578 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003579 PyObject *rep;
3580 if (mapping->ob_type == &EncodingMapType) {
3581 int res = encoding_map_lookup(p[collendpos], mapping);
3582 if (res != -1)
3583 break;
3584 ++collendpos;
3585 continue;
3586 }
3587
3588 rep = charmapencode_lookup(p[collendpos], mapping);
3589 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003591 else if (rep!=Py_None) {
3592 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 break;
3594 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003595 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 ++collendpos;
3597 }
3598 /* cache callback name lookup
3599 * (if not done yet, i.e. it's the first error) */
3600 if (*known_errorHandler==-1) {
3601 if ((errors==NULL) || (!strcmp(errors, "strict")))
3602 *known_errorHandler = 1;
3603 else if (!strcmp(errors, "replace"))
3604 *known_errorHandler = 2;
3605 else if (!strcmp(errors, "ignore"))
3606 *known_errorHandler = 3;
3607 else if (!strcmp(errors, "xmlcharrefreplace"))
3608 *known_errorHandler = 4;
3609 else
3610 *known_errorHandler = 0;
3611 }
3612 switch (*known_errorHandler) {
3613 case 1: /* strict */
3614 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3615 return -1;
3616 case 2: /* replace */
3617 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3618 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003619 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 return -1;
3621 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003622 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3624 return -1;
3625 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 }
3627 /* fall through */
3628 case 3: /* ignore */
3629 *inpos = collendpos;
3630 break;
3631 case 4: /* xmlcharrefreplace */
3632 /* generate replacement (temporarily (mis)uses p) */
3633 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3634 char buffer[2+29+1+1];
3635 char *cp;
3636 sprintf(buffer, "&#%d;", (int)p[collpos]);
3637 for (cp = buffer; *cp; ++cp) {
3638 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003639 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003641 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3643 return -1;
3644 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645 }
3646 }
3647 *inpos = collendpos;
3648 break;
3649 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003650 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 encoding, reason, p, size, exceptionObject,
3652 collstartpos, collendpos, &newpos);
3653 if (repunicode == NULL)
3654 return -1;
3655 /* generate replacement */
3656 repsize = PyUnicode_GET_SIZE(repunicode);
3657 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3658 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003659 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 return -1;
3661 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003662 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3665 return -1;
3666 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 }
3668 *inpos = newpos;
3669 Py_DECREF(repunicode);
3670 }
3671 return 0;
3672}
3673
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003675 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 PyObject *mapping,
3677 const char *errors)
3678{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 /* output object */
3680 PyObject *res = NULL;
3681 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003682 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003684 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 PyObject *errorHandler = NULL;
3686 PyObject *exc = NULL;
3687 /* the following variable is used for caching string comparisons
3688 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3689 * 3=ignore, 4=xmlcharrefreplace */
3690 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691
3692 /* Default to Latin-1 */
3693 if (mapping == NULL)
3694 return PyUnicode_EncodeLatin1(p, size, errors);
3695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 /* allocate enough for a simple encoding without
3697 replacements, if we need more, we'll resize */
3698 res = PyString_FromStringAndSize(NULL, size);
3699 if (res == NULL)
3700 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003701 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003704 while (inpos<size) {
3705 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003706 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3707 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003709 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 if (charmap_encoding_error(p, size, &inpos, mapping,
3711 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003712 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003713 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003714 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 else
3718 /* done with this character => adjust input position */
3719 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 /* Resize if we allocated to much */
3723 if (respos<PyString_GET_SIZE(res)) {
3724 if (_PyString_Resize(&res, respos))
3725 goto onError;
3726 }
3727 Py_XDECREF(exc);
3728 Py_XDECREF(errorHandler);
3729 return res;
3730
3731 onError:
3732 Py_XDECREF(res);
3733 Py_XDECREF(exc);
3734 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 return NULL;
3736}
3737
3738PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3739 PyObject *mapping)
3740{
3741 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3742 PyErr_BadArgument();
3743 return NULL;
3744 }
3745 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3746 PyUnicode_GET_SIZE(unicode),
3747 mapping,
3748 NULL);
3749}
3750
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751/* create or adjust a UnicodeTranslateError */
3752static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003753 const Py_UNICODE *unicode, Py_ssize_t size,
3754 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 if (*exceptionObject == NULL) {
3758 *exceptionObject = PyUnicodeTranslateError_Create(
3759 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 }
3761 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3763 goto onError;
3764 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3765 goto onError;
3766 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3767 goto onError;
3768 return;
3769 onError:
3770 Py_DECREF(*exceptionObject);
3771 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 }
3773}
3774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775/* raises a UnicodeTranslateError */
3776static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003777 const Py_UNICODE *unicode, Py_ssize_t size,
3778 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 const char *reason)
3780{
3781 make_translate_exception(exceptionObject,
3782 unicode, size, startpos, endpos, reason);
3783 if (*exceptionObject != NULL)
3784 PyCodec_StrictErrors(*exceptionObject);
3785}
3786
3787/* error handling callback helper:
3788 build arguments, call the callback and check the arguments,
3789 put the result into newpos and return the replacement string, which
3790 has to be freed by the caller */
3791static PyObject *unicode_translate_call_errorhandler(const char *errors,
3792 PyObject **errorHandler,
3793 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003794 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3795 Py_ssize_t startpos, Py_ssize_t endpos,
3796 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003798 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799
Martin v. Löwis412fb672006-04-13 06:34:32 +00003800 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 PyObject *restuple;
3802 PyObject *resunicode;
3803
3804 if (*errorHandler == NULL) {
3805 *errorHandler = PyCodec_LookupError(errors);
3806 if (*errorHandler == NULL)
3807 return NULL;
3808 }
3809
3810 make_translate_exception(exceptionObject,
3811 unicode, size, startpos, endpos, reason);
3812 if (*exceptionObject == NULL)
3813 return NULL;
3814
3815 restuple = PyObject_CallFunctionObjArgs(
3816 *errorHandler, *exceptionObject, NULL);
3817 if (restuple == NULL)
3818 return NULL;
3819 if (!PyTuple_Check(restuple)) {
3820 PyErr_Format(PyExc_TypeError, &argparse[4]);
3821 Py_DECREF(restuple);
3822 return NULL;
3823 }
3824 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003825 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003826 Py_DECREF(restuple);
3827 return NULL;
3828 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003829 if (i_newpos<0)
3830 *newpos = size+i_newpos;
3831 else
3832 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003833 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003834 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003835 Py_DECREF(restuple);
3836 return NULL;
3837 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 Py_INCREF(resunicode);
3839 Py_DECREF(restuple);
3840 return resunicode;
3841}
3842
3843/* Lookup the character ch in the mapping and put the result in result,
3844 which must be decrefed by the caller.
3845 Return 0 on success, -1 on error */
3846static
3847int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3848{
3849 PyObject *w = PyInt_FromLong((long)c);
3850 PyObject *x;
3851
3852 if (w == NULL)
3853 return -1;
3854 x = PyObject_GetItem(mapping, w);
3855 Py_DECREF(w);
3856 if (x == NULL) {
3857 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3858 /* No mapping found means: use 1:1 mapping. */
3859 PyErr_Clear();
3860 *result = NULL;
3861 return 0;
3862 } else
3863 return -1;
3864 }
3865 else if (x == Py_None) {
3866 *result = x;
3867 return 0;
3868 }
3869 else if (PyInt_Check(x)) {
3870 long value = PyInt_AS_LONG(x);
3871 long max = PyUnicode_GetMax();
3872 if (value < 0 || value > max) {
3873 PyErr_Format(PyExc_TypeError,
3874 "character mapping must be in range(0x%lx)", max+1);
3875 Py_DECREF(x);
3876 return -1;
3877 }
3878 *result = x;
3879 return 0;
3880 }
3881 else if (PyUnicode_Check(x)) {
3882 *result = x;
3883 return 0;
3884 }
3885 else {
3886 /* wrong return value */
3887 PyErr_SetString(PyExc_TypeError,
3888 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003889 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 return -1;
3891 }
3892}
3893/* ensure that *outobj is at least requiredsize characters long,
3894if not reallocate and adjust various state variables.
3895Return 0 on success, -1 on error */
3896static
Walter Dörwald4894c302003-10-24 14:25:28 +00003897int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003898 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003900 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003901 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003902 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003903 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003905 if (requiredsize < 2 * oldsize)
3906 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003907 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 return -1;
3909 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 }
3911 return 0;
3912}
3913/* lookup the character, put the result in the output string and adjust
3914 various state variables. Return a new reference to the object that
3915 was put in the output buffer in *result, or Py_None, if the mapping was
3916 undefined (in which case no character was written).
3917 The called must decref result.
3918 Return 0 on success, -1 on error. */
3919static
Walter Dörwald4894c302003-10-24 14:25:28 +00003920int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003921 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003922 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923{
Walter Dörwald4894c302003-10-24 14:25:28 +00003924 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 return -1;
3926 if (*res==NULL) {
3927 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003928 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 }
3930 else if (*res==Py_None)
3931 ;
3932 else if (PyInt_Check(*res)) {
3933 /* no overflow check, because we know that the space is enough */
3934 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3935 }
3936 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003937 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 if (repsize==1) {
3939 /* no overflow check, because we know that the space is enough */
3940 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3941 }
3942 else if (repsize!=0) {
3943 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003944 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003945 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003946 repsize - 1;
3947 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 return -1;
3949 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3950 *outp += repsize;
3951 }
3952 }
3953 else
3954 return -1;
3955 return 0;
3956}
3957
3958PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 PyObject *mapping,
3961 const char *errors)
3962{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 /* output object */
3964 PyObject *res = NULL;
3965 /* pointers to the beginning and end+1 of input */
3966 const Py_UNICODE *startp = p;
3967 const Py_UNICODE *endp = p + size;
3968 /* pointer into the output */
3969 Py_UNICODE *str;
3970 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003971 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 char *reason = "character maps to <undefined>";
3973 PyObject *errorHandler = NULL;
3974 PyObject *exc = NULL;
3975 /* the following variable is used for caching string comparisons
3976 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3977 * 3=ignore, 4=xmlcharrefreplace */
3978 int known_errorHandler = -1;
3979
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 if (mapping == NULL) {
3981 PyErr_BadArgument();
3982 return NULL;
3983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984
3985 /* allocate enough for a simple 1:1 translation without
3986 replacements, if we need more, we'll resize */
3987 res = PyUnicode_FromUnicode(NULL, size);
3988 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003989 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 return res;
3992 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 while (p<endp) {
3995 /* try to encode it */
3996 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003997 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 goto onError;
4000 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004001 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 if (x!=Py_None) /* it worked => adjust input pointer */
4003 ++p;
4004 else { /* untranslatable character */
4005 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004006 Py_ssize_t repsize;
4007 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008 Py_UNICODE *uni2;
4009 /* startpos for collecting untranslatable chars */
4010 const Py_UNICODE *collstart = p;
4011 const Py_UNICODE *collend = p+1;
4012 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 /* find all untranslatable characters */
4015 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004016 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 goto onError;
4018 Py_XDECREF(x);
4019 if (x!=Py_None)
4020 break;
4021 ++collend;
4022 }
4023 /* cache callback name lookup
4024 * (if not done yet, i.e. it's the first error) */
4025 if (known_errorHandler==-1) {
4026 if ((errors==NULL) || (!strcmp(errors, "strict")))
4027 known_errorHandler = 1;
4028 else if (!strcmp(errors, "replace"))
4029 known_errorHandler = 2;
4030 else if (!strcmp(errors, "ignore"))
4031 known_errorHandler = 3;
4032 else if (!strcmp(errors, "xmlcharrefreplace"))
4033 known_errorHandler = 4;
4034 else
4035 known_errorHandler = 0;
4036 }
4037 switch (known_errorHandler) {
4038 case 1: /* strict */
4039 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4040 goto onError;
4041 case 2: /* replace */
4042 /* No need to check for space, this is a 1:1 replacement */
4043 for (coll = collstart; coll<collend; ++coll)
4044 *str++ = '?';
4045 /* fall through */
4046 case 3: /* ignore */
4047 p = collend;
4048 break;
4049 case 4: /* xmlcharrefreplace */
4050 /* generate replacement (temporarily (mis)uses p) */
4051 for (p = collstart; p < collend; ++p) {
4052 char buffer[2+29+1+1];
4053 char *cp;
4054 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004055 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4057 goto onError;
4058 for (cp = buffer; *cp; ++cp)
4059 *str++ = *cp;
4060 }
4061 p = collend;
4062 break;
4063 default:
4064 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4065 reason, startp, size, &exc,
4066 collstart-startp, collend-startp, &newpos);
4067 if (repunicode == NULL)
4068 goto onError;
4069 /* generate replacement */
4070 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004071 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4073 Py_DECREF(repunicode);
4074 goto onError;
4075 }
4076 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4077 *str++ = *uni2;
4078 p = startp + newpos;
4079 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 }
4081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 /* Resize if we allocated to much */
4084 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004085 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004086 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004087 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 }
4089 Py_XDECREF(exc);
4090 Py_XDECREF(errorHandler);
4091 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093 onError:
4094 Py_XDECREF(res);
4095 Py_XDECREF(exc);
4096 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 return NULL;
4098}
4099
4100PyObject *PyUnicode_Translate(PyObject *str,
4101 PyObject *mapping,
4102 const char *errors)
4103{
4104 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004105
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 str = PyUnicode_FromObject(str);
4107 if (str == NULL)
4108 goto onError;
4109 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4110 PyUnicode_GET_SIZE(str),
4111 mapping,
4112 errors);
4113 Py_DECREF(str);
4114 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004115
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 onError:
4117 Py_XDECREF(str);
4118 return NULL;
4119}
Tim Petersced69f82003-09-16 20:30:58 +00004120
Guido van Rossum9e896b32000-04-05 20:11:21 +00004121/* --- Decimal Encoder ---------------------------------------------------- */
4122
4123int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004124 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004125 char *output,
4126 const char *errors)
4127{
4128 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 PyObject *errorHandler = NULL;
4130 PyObject *exc = NULL;
4131 const char *encoding = "decimal";
4132 const char *reason = "invalid decimal Unicode string";
4133 /* the following variable is used for caching string comparisons
4134 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4135 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004136
4137 if (output == NULL) {
4138 PyErr_BadArgument();
4139 return -1;
4140 }
4141
4142 p = s;
4143 end = s + length;
4144 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004146 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004148 Py_ssize_t repsize;
4149 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 Py_UNICODE *uni2;
4151 Py_UNICODE *collstart;
4152 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004153
Guido van Rossum9e896b32000-04-05 20:11:21 +00004154 if (Py_UNICODE_ISSPACE(ch)) {
4155 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004157 continue;
4158 }
4159 decimal = Py_UNICODE_TODECIMAL(ch);
4160 if (decimal >= 0) {
4161 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004163 continue;
4164 }
Guido van Rossumba477042000-04-06 18:18:10 +00004165 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004166 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004168 continue;
4169 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 /* All other characters are considered unencodable */
4171 collstart = p;
4172 collend = p+1;
4173 while (collend < end) {
4174 if ((0 < *collend && *collend < 256) ||
4175 !Py_UNICODE_ISSPACE(*collend) ||
4176 Py_UNICODE_TODECIMAL(*collend))
4177 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 /* cache callback name lookup
4180 * (if not done yet, i.e. it's the first error) */
4181 if (known_errorHandler==-1) {
4182 if ((errors==NULL) || (!strcmp(errors, "strict")))
4183 known_errorHandler = 1;
4184 else if (!strcmp(errors, "replace"))
4185 known_errorHandler = 2;
4186 else if (!strcmp(errors, "ignore"))
4187 known_errorHandler = 3;
4188 else if (!strcmp(errors, "xmlcharrefreplace"))
4189 known_errorHandler = 4;
4190 else
4191 known_errorHandler = 0;
4192 }
4193 switch (known_errorHandler) {
4194 case 1: /* strict */
4195 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4196 goto onError;
4197 case 2: /* replace */
4198 for (p = collstart; p < collend; ++p)
4199 *output++ = '?';
4200 /* fall through */
4201 case 3: /* ignore */
4202 p = collend;
4203 break;
4204 case 4: /* xmlcharrefreplace */
4205 /* generate replacement (temporarily (mis)uses p) */
4206 for (p = collstart; p < collend; ++p)
4207 output += sprintf(output, "&#%d;", (int)*p);
4208 p = collend;
4209 break;
4210 default:
4211 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4212 encoding, reason, s, length, &exc,
4213 collstart-s, collend-s, &newpos);
4214 if (repunicode == NULL)
4215 goto onError;
4216 /* generate replacement */
4217 repsize = PyUnicode_GET_SIZE(repunicode);
4218 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4219 Py_UNICODE ch = *uni2;
4220 if (Py_UNICODE_ISSPACE(ch))
4221 *output++ = ' ';
4222 else {
4223 decimal = Py_UNICODE_TODECIMAL(ch);
4224 if (decimal >= 0)
4225 *output++ = '0' + decimal;
4226 else if (0 < ch && ch < 256)
4227 *output++ = (char)ch;
4228 else {
4229 Py_DECREF(repunicode);
4230 raise_encode_exception(&exc, encoding,
4231 s, length, collstart-s, collend-s, reason);
4232 goto onError;
4233 }
4234 }
4235 }
4236 p = s + newpos;
4237 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004238 }
4239 }
4240 /* 0-terminate the output string */
4241 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 Py_XDECREF(exc);
4243 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004244 return 0;
4245
4246 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 Py_XDECREF(exc);
4248 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004249 return -1;
4250}
4251
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252/* --- Helpers ------------------------------------------------------------ */
4253
Fredrik Lundha50d2012006-05-26 17:04:58 +00004254#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004255
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004256#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004257#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004258#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004259
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004260Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004261STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4262{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004263 if (str[0] != other[0])
4264 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004265 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4266}
4267
Fredrik Lundhb9479482006-05-26 17:22:38 +00004268#define STRINGLIB_EMPTY unicode_empty
4269
Fredrik Lundha50d2012006-05-26 17:04:58 +00004270#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004271
4272#include "stringlib/count.h"
4273#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004274#include "stringlib/partition.h"
4275
/* Helper macro to fix up start/end slice values.  It operates on variables
   named `start` and `end` in the enclosing scope: a negative value is taken
   relative to (obj)->length and then floored at 0, and `end` is first capped
   at (obj)->length.  The expansion is several statements, so use it only as
   a statement of its own inside a braced block. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4288
Martin v. Löwis18e16552006-02-15 17:27:45 +00004289Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004290 PyObject *substr,
4291 Py_ssize_t start,
4292 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004294 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004295 PyUnicodeObject* str_obj;
4296 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004297
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004298 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4299 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004301 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4302 if (!sub_obj) {
4303 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304 return -1;
4305 }
Tim Petersced69f82003-09-16 20:30:58 +00004306
Fredrik Lundhc8162812006-05-26 19:33:03 +00004307 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004308
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004309 result = stringlib_count(
4310 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4311 );
4312
4313 Py_DECREF(sub_obj);
4314 Py_DECREF(str_obj);
4315
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 return result;
4317}
4318
Martin v. Löwis18e16552006-02-15 17:27:45 +00004319Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004320 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004321 Py_ssize_t start,
4322 Py_ssize_t end,
4323 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004325 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004326
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004327 str = PyUnicode_FromObject(str);
4328 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004329 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004330 sub = PyUnicode_FromObject(sub);
4331 if (!sub) {
4332 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004333 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 }
Tim Petersced69f82003-09-16 20:30:58 +00004335
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004336 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004337 result = stringlib_find_slice(
4338 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4339 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4340 start, end
4341 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004342 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004343 result = stringlib_rfind_slice(
4344 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4345 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4346 start, end
4347 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004348
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004349 Py_DECREF(str);
4350 Py_DECREF(sub);
4351
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352 return result;
4353}
4354
Tim Petersced69f82003-09-16 20:30:58 +00004355static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356int tailmatch(PyUnicodeObject *self,
4357 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004358 Py_ssize_t start,
4359 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 int direction)
4361{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362 if (substring->length == 0)
4363 return 1;
4364
Fredrik Lundhc8162812006-05-26 19:33:03 +00004365 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366
4367 end -= substring->length;
4368 if (end < start)
4369 return 0;
4370
4371 if (direction > 0) {
4372 if (Py_UNICODE_MATCH(self, end, substring))
4373 return 1;
4374 } else {
4375 if (Py_UNICODE_MATCH(self, start, substring))
4376 return 1;
4377 }
4378
4379 return 0;
4380}
4381
Martin v. Löwis18e16552006-02-15 17:27:45 +00004382Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004384 Py_ssize_t start,
4385 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386 int direction)
4387{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004388 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004389
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 str = PyUnicode_FromObject(str);
4391 if (str == NULL)
4392 return -1;
4393 substr = PyUnicode_FromObject(substr);
4394 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004395 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 return -1;
4397 }
Tim Petersced69f82003-09-16 20:30:58 +00004398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 result = tailmatch((PyUnicodeObject *)str,
4400 (PyUnicodeObject *)substr,
4401 start, end, direction);
4402 Py_DECREF(str);
4403 Py_DECREF(substr);
4404 return result;
4405}
4406
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407/* Apply fixfct filter to the Unicode object self and return a
4408 reference to the modified object */
4409
Tim Petersced69f82003-09-16 20:30:58 +00004410static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411PyObject *fixup(PyUnicodeObject *self,
4412 int (*fixfct)(PyUnicodeObject *s))
4413{
4414
4415 PyUnicodeObject *u;
4416
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004417 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 if (u == NULL)
4419 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004420
4421 Py_UNICODE_COPY(u->str, self->str, self->length);
4422
Tim Peters7a29bd52001-09-12 03:03:31 +00004423 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 /* fixfct should return TRUE if it modified the buffer. If
4425 FALSE, return a reference to the original buffer instead
4426 (to save space, not time) */
4427 Py_INCREF(self);
4428 Py_DECREF(u);
4429 return (PyObject*) self;
4430 }
4431 return (PyObject*) u;
4432}
4433
Tim Petersced69f82003-09-16 20:30:58 +00004434static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435int fixupper(PyUnicodeObject *self)
4436{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004437 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 Py_UNICODE *s = self->str;
4439 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004440
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 while (len-- > 0) {
4442 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004443
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 ch = Py_UNICODE_TOUPPER(*s);
4445 if (ch != *s) {
4446 status = 1;
4447 *s = ch;
4448 }
4449 s++;
4450 }
4451
4452 return status;
4453}
4454
Tim Petersced69f82003-09-16 20:30:58 +00004455static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456int fixlower(PyUnicodeObject *self)
4457{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004458 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 Py_UNICODE *s = self->str;
4460 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004461
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 while (len-- > 0) {
4463 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004464
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 ch = Py_UNICODE_TOLOWER(*s);
4466 if (ch != *s) {
4467 status = 1;
4468 *s = ch;
4469 }
4470 s++;
4471 }
4472
4473 return status;
4474}
4475
Tim Petersced69f82003-09-16 20:30:58 +00004476static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477int fixswapcase(PyUnicodeObject *self)
4478{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004479 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 Py_UNICODE *s = self->str;
4481 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004482
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 while (len-- > 0) {
4484 if (Py_UNICODE_ISUPPER(*s)) {
4485 *s = Py_UNICODE_TOLOWER(*s);
4486 status = 1;
4487 } else if (Py_UNICODE_ISLOWER(*s)) {
4488 *s = Py_UNICODE_TOUPPER(*s);
4489 status = 1;
4490 }
4491 s++;
4492 }
4493
4494 return status;
4495}
4496
Tim Petersced69f82003-09-16 20:30:58 +00004497static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498int fixcapitalize(PyUnicodeObject *self)
4499{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004501 Py_UNICODE *s = self->str;
4502 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004503
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004504 if (len == 0)
4505 return 0;
4506 if (Py_UNICODE_ISLOWER(*s)) {
4507 *s = Py_UNICODE_TOUPPER(*s);
4508 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004510 s++;
4511 while (--len > 0) {
4512 if (Py_UNICODE_ISUPPER(*s)) {
4513 *s = Py_UNICODE_TOLOWER(*s);
4514 status = 1;
4515 }
4516 s++;
4517 }
4518 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519}
4520
4521static
4522int fixtitle(PyUnicodeObject *self)
4523{
4524 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4525 register Py_UNICODE *e;
4526 int previous_is_cased;
4527
4528 /* Shortcut for single character strings */
4529 if (PyUnicode_GET_SIZE(self) == 1) {
4530 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4531 if (*p != ch) {
4532 *p = ch;
4533 return 1;
4534 }
4535 else
4536 return 0;
4537 }
Tim Petersced69f82003-09-16 20:30:58 +00004538
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 e = p + PyUnicode_GET_SIZE(self);
4540 previous_is_cased = 0;
4541 for (; p < e; p++) {
4542 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004543
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 if (previous_is_cased)
4545 *p = Py_UNICODE_TOLOWER(ch);
4546 else
4547 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004548
4549 if (Py_UNICODE_ISLOWER(ch) ||
4550 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 Py_UNICODE_ISTITLE(ch))
4552 previous_is_cased = 1;
4553 else
4554 previous_is_cased = 0;
4555 }
4556 return 1;
4557}
4558
Tim Peters8ce9f162004-08-27 01:49:32 +00004559PyObject *
4560PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561{
Tim Peters8ce9f162004-08-27 01:49:32 +00004562 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004563 const Py_UNICODE blank = ' ';
4564 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004565 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004566 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004567 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4568 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004569 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4570 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004571 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004572 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004573 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574
Tim Peters05eba1f2004-08-27 21:32:02 +00004575 fseq = PySequence_Fast(seq, "");
4576 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004577 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004578 }
4579
Tim Peters91879ab2004-08-27 22:35:44 +00004580 /* Grrrr. A codec may be invoked to convert str objects to
4581 * Unicode, and so it's possible to call back into Python code
4582 * during PyUnicode_FromObject(), and so it's possible for a sick
4583 * codec to change the size of fseq (if seq is a list). Therefore
4584 * we have to keep refetching the size -- can't assume seqlen
4585 * is invariant.
4586 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004587 seqlen = PySequence_Fast_GET_SIZE(fseq);
4588 /* If empty sequence, return u"". */
4589 if (seqlen == 0) {
4590 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4591 goto Done;
4592 }
4593 /* If singleton sequence with an exact Unicode, return that. */
4594 if (seqlen == 1) {
4595 item = PySequence_Fast_GET_ITEM(fseq, 0);
4596 if (PyUnicode_CheckExact(item)) {
4597 Py_INCREF(item);
4598 res = (PyUnicodeObject *)item;
4599 goto Done;
4600 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004601 }
4602
Tim Peters05eba1f2004-08-27 21:32:02 +00004603 /* At least two items to join, or one that isn't exact Unicode. */
4604 if (seqlen > 1) {
4605 /* Set up sep and seplen -- they're needed. */
4606 if (separator == NULL) {
4607 sep = &blank;
4608 seplen = 1;
4609 }
4610 else {
4611 internal_separator = PyUnicode_FromObject(separator);
4612 if (internal_separator == NULL)
4613 goto onError;
4614 sep = PyUnicode_AS_UNICODE(internal_separator);
4615 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004616 /* In case PyUnicode_FromObject() mutated seq. */
4617 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004618 }
4619 }
4620
4621 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004622 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004623 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004624 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004625 res_p = PyUnicode_AS_UNICODE(res);
4626 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004627
Tim Peters05eba1f2004-08-27 21:32:02 +00004628 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004629 Py_ssize_t itemlen;
4630 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004631
4632 item = PySequence_Fast_GET_ITEM(fseq, i);
4633 /* Convert item to Unicode. */
4634 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4635 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004636 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004637 " %.80s found",
4638 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004639 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004640 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004641 item = PyUnicode_FromObject(item);
4642 if (item == NULL)
4643 goto onError;
4644 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004645
Tim Peters91879ab2004-08-27 22:35:44 +00004646 /* In case PyUnicode_FromObject() mutated seq. */
4647 seqlen = PySequence_Fast_GET_SIZE(fseq);
4648
Tim Peters8ce9f162004-08-27 01:49:32 +00004649 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004651 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004652 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004653 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004654 if (i < seqlen - 1) {
4655 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004656 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004657 goto Overflow;
4658 }
4659 if (new_res_used > res_alloc) {
4660 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004661 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004662 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004663 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004664 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004665 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004666 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004667 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004669 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004670 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004672
4673 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004674 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004675 res_p += itemlen;
4676 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004677 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004678 res_p += seplen;
4679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004681 res_used = new_res_used;
4682 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004683
Tim Peters05eba1f2004-08-27 21:32:02 +00004684 /* Shrink res to match the used area; this probably can't fail,
4685 * but it's cheap to check.
4686 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004687 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004688 goto onError;
4689
4690 Done:
4691 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004692 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 return (PyObject *)res;
4694
Tim Peters8ce9f162004-08-27 01:49:32 +00004695 Overflow:
4696 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004697 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004698 Py_DECREF(item);
4699 /* fall through */
4700
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004702 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004703 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004704 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 return NULL;
4706}
4707
Tim Petersced69f82003-09-16 20:30:58 +00004708static
4709PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004710 Py_ssize_t left,
4711 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 Py_UNICODE fill)
4713{
4714 PyUnicodeObject *u;
4715
4716 if (left < 0)
4717 left = 0;
4718 if (right < 0)
4719 right = 0;
4720
Tim Peters7a29bd52001-09-12 03:03:31 +00004721 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 Py_INCREF(self);
4723 return self;
4724 }
4725
4726 u = _PyUnicode_New(left + self->length + right);
4727 if (u) {
4728 if (left)
4729 Py_UNICODE_FILL(u->str, fill, left);
4730 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4731 if (right)
4732 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4733 }
4734
4735 return u;
4736}
4737
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch variable,
   a `list` variable and an `onError` label; control jumps to `onError`
   when either the slice cannot be created or the append fails.  The
   temporary reference kept in `str` is released in every case.

   Wrapped in do { } while (0) so the macro behaves as one statement
   (e.g. under an unbraced `if`); invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4748
4749static
4750PyObject *split_whitespace(PyUnicodeObject *self,
4751 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004752 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004754 register Py_ssize_t i;
4755 register Py_ssize_t j;
4756 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 PyObject *str;
4758
4759 for (i = j = 0; i < len; ) {
4760 /* find a token */
4761 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4762 i++;
4763 j = i;
4764 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4765 i++;
4766 if (j < i) {
4767 if (maxcount-- <= 0)
4768 break;
4769 SPLIT_APPEND(self->str, j, i);
4770 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4771 i++;
4772 j = i;
4773 }
4774 }
4775 if (j < len) {
4776 SPLIT_APPEND(self->str, j, len);
4777 }
4778 return list;
4779
4780 onError:
4781 Py_DECREF(list);
4782 return NULL;
4783}
4784
4785PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004786 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 register Py_ssize_t i;
4789 register Py_ssize_t j;
4790 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 PyObject *list;
4792 PyObject *str;
4793 Py_UNICODE *data;
4794
4795 string = PyUnicode_FromObject(string);
4796 if (string == NULL)
4797 return NULL;
4798 data = PyUnicode_AS_UNICODE(string);
4799 len = PyUnicode_GET_SIZE(string);
4800
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 list = PyList_New(0);
4802 if (!list)
4803 goto onError;
4804
4805 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004806 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004809 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811
4812 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004813 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 if (i < len) {
4815 if (data[i] == '\r' && i + 1 < len &&
4816 data[i+1] == '\n')
4817 i += 2;
4818 else
4819 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004820 if (keepends)
4821 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 }
Guido van Rossum86662912000-04-11 15:38:46 +00004823 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 j = i;
4825 }
4826 if (j < len) {
4827 SPLIT_APPEND(data, j, len);
4828 }
4829
4830 Py_DECREF(string);
4831 return list;
4832
4833 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004834 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 Py_DECREF(string);
4836 return NULL;
4837}
4838
Tim Petersced69f82003-09-16 20:30:58 +00004839static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840PyObject *split_char(PyUnicodeObject *self,
4841 PyObject *list,
4842 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004843 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004845 register Py_ssize_t i;
4846 register Py_ssize_t j;
4847 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 PyObject *str;
4849
4850 for (i = j = 0; i < len; ) {
4851 if (self->str[i] == ch) {
4852 if (maxcount-- <= 0)
4853 break;
4854 SPLIT_APPEND(self->str, j, i);
4855 i = j = i + 1;
4856 } else
4857 i++;
4858 }
4859 if (j <= len) {
4860 SPLIT_APPEND(self->str, j, len);
4861 }
4862 return list;
4863
4864 onError:
4865 Py_DECREF(list);
4866 return NULL;
4867}
4868
Tim Petersced69f82003-09-16 20:30:58 +00004869static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870PyObject *split_substring(PyUnicodeObject *self,
4871 PyObject *list,
4872 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004875 register Py_ssize_t i;
4876 register Py_ssize_t j;
4877 Py_ssize_t len = self->length;
4878 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 PyObject *str;
4880
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004881 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 if (Py_UNICODE_MATCH(self, i, substring)) {
4883 if (maxcount-- <= 0)
4884 break;
4885 SPLIT_APPEND(self->str, j, i);
4886 i = j = i + sublen;
4887 } else
4888 i++;
4889 }
4890 if (j <= len) {
4891 SPLIT_APPEND(self->str, j, len);
4892 }
4893 return list;
4894
4895 onError:
4896 Py_DECREF(list);
4897 return NULL;
4898}
4899
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004900static
4901PyObject *rsplit_whitespace(PyUnicodeObject *self,
4902 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004903 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004904{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004905 register Py_ssize_t i;
4906 register Py_ssize_t j;
4907 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004908 PyObject *str;
4909
4910 for (i = j = len - 1; i >= 0; ) {
4911 /* find a token */
4912 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4913 i--;
4914 j = i;
4915 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4916 i--;
4917 if (j > i) {
4918 if (maxcount-- <= 0)
4919 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004920 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004921 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4922 i--;
4923 j = i;
4924 }
4925 }
4926 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004927 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004928 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004929 if (PyList_Reverse(list) < 0)
4930 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004931 return list;
4932
4933 onError:
4934 Py_DECREF(list);
4935 return NULL;
4936}
4937
4938static
4939PyObject *rsplit_char(PyUnicodeObject *self,
4940 PyObject *list,
4941 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004942 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004943{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004944 register Py_ssize_t i;
4945 register Py_ssize_t j;
4946 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004947 PyObject *str;
4948
4949 for (i = j = len - 1; i >= 0; ) {
4950 if (self->str[i] == ch) {
4951 if (maxcount-- <= 0)
4952 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004953 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004954 j = i = i - 1;
4955 } else
4956 i--;
4957 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004958 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004959 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004960 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004961 if (PyList_Reverse(list) < 0)
4962 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004963 return list;
4964
4965 onError:
4966 Py_DECREF(list);
4967 return NULL;
4968}
4969
4970static
4971PyObject *rsplit_substring(PyUnicodeObject *self,
4972 PyObject *list,
4973 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004974 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004975{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004976 register Py_ssize_t i;
4977 register Py_ssize_t j;
4978 Py_ssize_t len = self->length;
4979 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004980 PyObject *str;
4981
4982 for (i = len - sublen, j = len; i >= 0; ) {
4983 if (Py_UNICODE_MATCH(self, i, substring)) {
4984 if (maxcount-- <= 0)
4985 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004986 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004987 j = i;
4988 i -= sublen;
4989 } else
4990 i--;
4991 }
4992 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004993 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004994 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004995 if (PyList_Reverse(list) < 0)
4996 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004997 return list;
4998
4999 onError:
5000 Py_DECREF(list);
5001 return NULL;
5002}
5003
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004#undef SPLIT_APPEND
5005
5006static
5007PyObject *split(PyUnicodeObject *self,
5008 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005009 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010{
5011 PyObject *list;
5012
5013 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005014 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015
5016 list = PyList_New(0);
5017 if (!list)
5018 return NULL;
5019
5020 if (substring == NULL)
5021 return split_whitespace(self,list,maxcount);
5022
5023 else if (substring->length == 1)
5024 return split_char(self,list,substring->str[0],maxcount);
5025
5026 else if (substring->length == 0) {
5027 Py_DECREF(list);
5028 PyErr_SetString(PyExc_ValueError, "empty separator");
5029 return NULL;
5030 }
5031 else
5032 return split_substring(self,list,substring,maxcount);
5033}
5034
Tim Petersced69f82003-09-16 20:30:58 +00005035static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005036PyObject *rsplit(PyUnicodeObject *self,
5037 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005038 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005039{
5040 PyObject *list;
5041
5042 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005043 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005044
5045 list = PyList_New(0);
5046 if (!list)
5047 return NULL;
5048
5049 if (substring == NULL)
5050 return rsplit_whitespace(self,list,maxcount);
5051
5052 else if (substring->length == 1)
5053 return rsplit_char(self,list,substring->str[0],maxcount);
5054
5055 else if (substring->length == 0) {
5056 Py_DECREF(list);
5057 PyErr_SetString(PyExc_ValueError, "empty separator");
5058 return NULL;
5059 }
5060 else
5061 return rsplit_substring(self,list,substring,maxcount);
5062}
5063
5064static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065PyObject *replace(PyUnicodeObject *self,
5066 PyUnicodeObject *str1,
5067 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005068 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069{
5070 PyUnicodeObject *u;
5071
5072 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005073 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074
Fredrik Lundh347ee272006-05-24 16:35:18 +00005075 if (str1->length == str2->length) {
5076 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005077 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005078 if (str1->length == 1) {
5079 /* replace characters */
5080 Py_UNICODE u1, u2;
5081 if (!findchar(self->str, self->length, str1->str[0]))
5082 goto nothing;
5083 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5084 if (!u)
5085 return NULL;
5086 Py_UNICODE_COPY(u->str, self->str, self->length);
5087 u1 = str1->str[0];
5088 u2 = str2->str[0];
5089 for (i = 0; i < u->length; i++)
5090 if (u->str[i] == u1) {
5091 if (--maxcount < 0)
5092 break;
5093 u->str[i] = u2;
5094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005096 i = fastsearch(
5097 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005099 if (i < 0)
5100 goto nothing;
5101 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5102 if (!u)
5103 return NULL;
5104 Py_UNICODE_COPY(u->str, self->str, self->length);
5105 while (i <= self->length - str1->length)
5106 if (Py_UNICODE_MATCH(self, i, str1)) {
5107 if (--maxcount < 0)
5108 break;
5109 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5110 i += str1->length;
5111 } else
5112 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005115
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005116 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005117 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 Py_UNICODE *p;
5119
5120 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005121 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 if (n > maxcount)
5123 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005124 if (n == 0)
5125 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005126 /* new_size = self->length + n * (str2->length - str1->length)); */
5127 delta = (str2->length - str1->length);
5128 if (delta == 0) {
5129 new_size = self->length;
5130 } else {
5131 product = n * (str2->length - str1->length);
5132 if ((product / (str2->length - str1->length)) != n) {
5133 PyErr_SetString(PyExc_OverflowError,
5134 "replace string is too long");
5135 return NULL;
5136 }
5137 new_size = self->length + product;
5138 if (new_size < 0) {
5139 PyErr_SetString(PyExc_OverflowError,
5140 "replace string is too long");
5141 return NULL;
5142 }
5143 }
5144 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005145 if (!u)
5146 return NULL;
5147 i = 0;
5148 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005149 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005150 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005151 while (n-- > 0) {
5152 /* look for next match */
5153 j = i;
5154 while (j <= e) {
5155 if (Py_UNICODE_MATCH(self, j, str1))
5156 break;
5157 j++;
5158 }
5159 if (j > i) {
5160 if (j > e)
5161 break;
5162 /* copy unchanged part [i:j] */
5163 Py_UNICODE_COPY(p, self->str+i, j-i);
5164 p += j - i;
5165 }
5166 /* copy substitution string */
5167 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005168 Py_UNICODE_COPY(p, str2->str, str2->length);
5169 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005170 }
5171 i = j + str1->length;
5172 }
5173 if (i < self->length)
5174 /* copy tail [i:] */
5175 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005176 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005177 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005178 while (n > 0) {
5179 Py_UNICODE_COPY(p, str2->str, str2->length);
5180 p += str2->length;
5181 if (--n <= 0)
5182 break;
5183 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005185 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 }
5187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005189
5190nothing:
5191 /* nothing to replace; return original string (when possible) */
5192 if (PyUnicode_CheckExact(self)) {
5193 Py_INCREF(self);
5194 return (PyObject *) self;
5195 }
5196 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197}
5198
5199/* --- Unicode Object Methods --------------------------------------------- */
5200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005201PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202"S.title() -> unicode\n\
5203\n\
5204Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005205characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206
5207static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005208unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 return fixup(self, fixtitle);
5211}
5212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005213PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214"S.capitalize() -> unicode\n\
5215\n\
5216Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005217have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
5219static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005220unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 return fixup(self, fixcapitalize);
5223}
5224
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Currently compiled out.  Splits `self` on whitespace, capitalizes
   every word in place inside the temporary list and joins the words
   again with PyUnicode_Join(NULL, ...).  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the slot of the old one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5262
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005263/* Argument converter. Coerces to a single unicode character */
5264
5265static int
5266convert_uc(PyObject *obj, void *addr)
5267{
5268 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5269 PyObject *uniobj;
5270 Py_UNICODE *unistr;
5271
5272 uniobj = PyUnicode_FromObject(obj);
5273 if (uniobj == NULL) {
5274 PyErr_SetString(PyExc_TypeError,
5275 "The fill character cannot be converted to Unicode");
5276 return 0;
5277 }
5278 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5279 PyErr_SetString(PyExc_TypeError,
5280 "The fill character must be exactly one character long");
5281 Py_DECREF(uniobj);
5282 return 0;
5283 }
5284 unistr = PyUnicode_AS_UNICODE(uniobj);
5285 *fillcharloc = unistr[0];
5286 Py_DECREF(uniobj);
5287 return 1;
5288}
5289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005290PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005291"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005293Return S centered in a Unicode string of length width. Padding is\n\
5294done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295
5296static PyObject *
5297unicode_center(PyUnicodeObject *self, PyObject *args)
5298{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005299 Py_ssize_t marg, left;
5300 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005301 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302
Thomas Woutersde017742006-02-16 19:34:37 +00005303 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 return NULL;
5305
Tim Peters7a29bd52001-09-12 03:03:31 +00005306 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 Py_INCREF(self);
5308 return (PyObject*) self;
5309 }
5310
5311 marg = width - self->length;
5312 left = marg / 2 + (marg & width & 1);
5313
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005314 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315}
5316
Marc-André Lemburge5034372000-08-08 08:04:29 +00005317#if 0
5318
5319/* This code should go into some future Unicode collation support
5320 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005321 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005322
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005323/* speedy UTF-16 code point order comparison */
5324/* gleaned from: */
5325/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5326
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005327static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005328{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005329 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005330 0, 0, 0, 0, 0, 0, 0, 0,
5331 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005332 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005333};
5334
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335static int
5336unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005339
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 Py_UNICODE *s1 = str1->str;
5341 Py_UNICODE *s2 = str2->str;
5342
5343 len1 = str1->length;
5344 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005345
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005347 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005348
5349 c1 = *s1++;
5350 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005351
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005352 if (c1 > (1<<11) * 26)
5353 c1 += utf16Fixup[c1>>11];
5354 if (c2 > (1<<11) * 26)
5355 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005356 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005357
5358 if (c1 != c2)
5359 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005360
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005361 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 }
5363
5364 return (len1 < len2) ? -1 : (len1 != len2);
5365}
5366
Marc-André Lemburge5034372000-08-08 08:04:29 +00005367#else
5368
5369static int
5370unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005372 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005373
5374 Py_UNICODE *s1 = str1->str;
5375 Py_UNICODE *s2 = str2->str;
5376
5377 len1 = str1->length;
5378 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005379
Marc-André Lemburge5034372000-08-08 08:04:29 +00005380 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005381 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005382
Fredrik Lundh45714e92001-06-26 16:39:36 +00005383 c1 = *s1++;
5384 c2 = *s2++;
5385
5386 if (c1 != c2)
5387 return (c1 < c2) ? -1 : 1;
5388
Marc-André Lemburge5034372000-08-08 08:04:29 +00005389 len1--; len2--;
5390 }
5391
5392 return (len1 < len2) ? -1 : (len1 != len2);
5393}
5394
5395#endif
5396
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397int PyUnicode_Compare(PyObject *left,
5398 PyObject *right)
5399{
5400 PyUnicodeObject *u = NULL, *v = NULL;
5401 int result;
5402
5403 /* Coerce the two arguments */
5404 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5405 if (u == NULL)
5406 goto onError;
5407 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5408 if (v == NULL)
5409 goto onError;
5410
Thomas Wouters7e474022000-07-16 12:04:32 +00005411 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 if (v == u) {
5413 Py_DECREF(u);
5414 Py_DECREF(v);
5415 return 0;
5416 }
5417
5418 result = unicode_compare(u, v);
5419
5420 Py_DECREF(u);
5421 Py_DECREF(v);
5422 return result;
5423
5424onError:
5425 Py_XDECREF(u);
5426 Py_XDECREF(v);
5427 return -1;
5428}
5429
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005430PyObject *PyUnicode_RichCompare(PyObject *left,
5431 PyObject *right,
5432 int op)
5433{
5434 int result;
5435
5436 result = PyUnicode_Compare(left, right);
5437 if (result == -1 && PyErr_Occurred())
5438 goto onError;
5439
5440 /* Convert the return value to a Boolean */
5441 switch (op) {
5442 case Py_EQ:
5443 result = (result == 0);
5444 break;
5445 case Py_NE:
5446 result = (result != 0);
5447 break;
5448 case Py_LE:
5449 result = (result <= 0);
5450 break;
5451 case Py_GE:
5452 result = (result >= 0);
5453 break;
5454 case Py_LT:
5455 result = (result == -1);
5456 break;
5457 case Py_GT:
5458 result = (result == 1);
5459 break;
5460 }
5461 return PyBool_FromLong(result);
5462
5463 onError:
5464
5465 /* Standard case
5466
5467 Type errors mean that PyUnicode_FromObject() could not convert
5468 one of the arguments (usually the right hand side) to Unicode,
5469 ie. we can't handle the comparison request. However, it is
5470 possible that the other object knows a comparison method, which
5471 is why we return Py_NotImplemented to give the other object a
5472 chance.
5473
5474 */
5475 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5476 PyErr_Clear();
5477 Py_INCREF(Py_NotImplemented);
5478 return Py_NotImplemented;
5479 }
5480 if (op != Py_EQ && op != Py_NE)
5481 return NULL;
5482
5483 /* Equality comparison.
5484
5485 This is a special case: we silence any PyExc_UnicodeDecodeError
5486 and instead turn it into a PyErr_UnicodeWarning.
5487
5488 */
5489 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5490 return NULL;
5491 PyErr_Clear();
5492 if (PyErr_Warn(PyExc_UnicodeWarning,
5493 (op == Py_EQ) ?
5494 "Unicode equal comparison "
5495 "failed to convert both arguments to Unicode - "
5496 "interpreting them as being unequal" :
5497 "Unicode unequal comparison "
5498 "failed to convert both arguments to Unicode - "
5499 "interpreting them as being unequal"
5500 ) < 0)
5501 return NULL;
5502 result = (op == Py_NE);
5503 return PyBool_FromLong(result);
5504}
5505
Guido van Rossum403d68b2000-03-13 15:55:09 +00005506int PyUnicode_Contains(PyObject *container,
5507 PyObject *element)
5508{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005509 PyObject *str, *sub;
5510 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005511
5512 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005513 sub = PyUnicode_FromObject(element);
5514 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005515 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005516 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005517 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005518 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005519
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005520 str = PyUnicode_FromObject(container);
5521 if (!str) {
5522 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005523 return -1;
5524 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005525
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005526 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005527
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005528 Py_DECREF(str);
5529 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005530
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005531 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005532}
5533
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534/* Concat to string or Unicode object giving a new Unicode object. */
5535
5536PyObject *PyUnicode_Concat(PyObject *left,
5537 PyObject *right)
5538{
5539 PyUnicodeObject *u = NULL, *v = NULL, *w;
5540
5541 /* Coerce the two arguments */
5542 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5543 if (u == NULL)
5544 goto onError;
5545 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5546 if (v == NULL)
5547 goto onError;
5548
5549 /* Shortcuts */
5550 if (v == unicode_empty) {
5551 Py_DECREF(v);
5552 return (PyObject *)u;
5553 }
5554 if (u == unicode_empty) {
5555 Py_DECREF(u);
5556 return (PyObject *)v;
5557 }
5558
5559 /* Concat the two Unicode strings */
5560 w = _PyUnicode_New(u->length + v->length);
5561 if (w == NULL)
5562 goto onError;
5563 Py_UNICODE_COPY(w->str, u->str, u->length);
5564 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5565
5566 Py_DECREF(u);
5567 Py_DECREF(v);
5568 return (PyObject *)w;
5569
5570onError:
5571 Py_XDECREF(u);
5572 Py_XDECREF(v);
5573 return NULL;
5574}
5575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005576PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577"S.count(sub[, start[, end]]) -> int\n\
5578\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005579Return the number of non-overlapping occurrences of substring sub in\n\
5580Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005581interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
5583static PyObject *
5584unicode_count(PyUnicodeObject *self, PyObject *args)
5585{
5586 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005587 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005588 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 PyObject *result;
5590
Guido van Rossumb8872e62000-05-09 14:14:27 +00005591 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5592 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 return NULL;
5594
5595 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005596 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 if (substring == NULL)
5598 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005599
Fredrik Lundhc8162812006-05-26 19:33:03 +00005600 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005602 result = PyInt_FromSsize_t(
5603 stringlib_count(self->str + start, end - start,
5604 substring->str, substring->length)
5605 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606
5607 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005608
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 return result;
5610}
5611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005612PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005613"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005615Encodes S using the codec registered for encoding. encoding defaults\n\
5616to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005617handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5619'xmlcharrefreplace' as well as any other name registered with\n\
5620codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621
5622static PyObject *
5623unicode_encode(PyUnicodeObject *self, PyObject *args)
5624{
5625 char *encoding = NULL;
5626 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005627 PyObject *v;
5628
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5630 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005631 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005632 if (v == NULL)
5633 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005634 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5635 PyErr_Format(PyExc_TypeError,
5636 "encoder did not return a string/unicode object "
5637 "(type=%.400s)",
5638 v->ob_type->tp_name);
5639 Py_DECREF(v);
5640 return NULL;
5641 }
5642 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005643
5644 onError:
5645 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005646}
5647
5648PyDoc_STRVAR(decode__doc__,
5649"S.decode([encoding[,errors]]) -> string or unicode\n\
5650\n\
5651Decodes S using the codec registered for encoding. encoding defaults\n\
5652to the default encoding. errors may be given to set a different error\n\
5653handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5654a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5655as well as any other name registerd with codecs.register_error that is\n\
5656able to handle UnicodeDecodeErrors.");
5657
5658static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005659unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005660{
5661 char *encoding = NULL;
5662 char *errors = NULL;
5663 PyObject *v;
5664
5665 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5666 return NULL;
5667 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005668 if (v == NULL)
5669 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005670 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5671 PyErr_Format(PyExc_TypeError,
5672 "decoder did not return a string/unicode object "
5673 "(type=%.400s)",
5674 v->ob_type->tp_name);
5675 Py_DECREF(v);
5676 return NULL;
5677 }
5678 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005679
5680 onError:
5681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682}
5683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005684PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685"S.expandtabs([tabsize]) -> unicode\n\
5686\n\
5687Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005688If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690static PyObject*
5691unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5692{
5693 Py_UNICODE *e;
5694 Py_UNICODE *p;
5695 Py_UNICODE *q;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005696 Py_UNICODE *qe;
5697 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 PyUnicodeObject *u;
5699 int tabsize = 8;
5700
5701 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5702 return NULL;
5703
Thomas Wouters7e474022000-07-16 12:04:32 +00005704 /* First pass: determine size of output string */
Guido van Rossum44a93e52008-03-11 21:14:54 +00005705 i = 0; /* chars up to and including most recent \n or \r */
5706 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
5707 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 for (p = self->str; p < e; p++)
5709 if (*p == '\t') {
Neal Norwitz66e64e22007-06-09 04:06:30 +00005710 if (tabsize > 0) {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005711 incr = tabsize - (j % tabsize); /* cannot overflow */
5712 if (j > PY_SSIZE_T_MAX - incr)
5713 goto overflow1;
5714 j += incr;
5715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 }
5717 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005718 if (j > PY_SSIZE_T_MAX - 1)
5719 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 j++;
5721 if (*p == '\n' || *p == '\r') {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005722 if (i > PY_SSIZE_T_MAX - j)
5723 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 i += j;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005725 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 }
5727 }
5728
Guido van Rossum44a93e52008-03-11 21:14:54 +00005729 if (i > PY_SSIZE_T_MAX - j)
5730 goto overflow1;
Neal Norwitz66e64e22007-06-09 04:06:30 +00005731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 /* Second pass: create output string and fill it */
5733 u = _PyUnicode_New(i + j);
5734 if (!u)
5735 return NULL;
5736
Guido van Rossum44a93e52008-03-11 21:14:54 +00005737 j = 0; /* same as in first pass */
5738 q = u->str; /* next output char */
5739 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740
5741 for (p = self->str; p < e; p++)
5742 if (*p == '\t') {
5743 if (tabsize > 0) {
5744 i = tabsize - (j % tabsize);
5745 j += i;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005746 while (i--) {
5747 if (q >= qe)
5748 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 *q++ = ' ';
Guido van Rossum44a93e52008-03-11 21:14:54 +00005750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 }
5752 }
5753 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005754 if (q >= qe)
5755 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 *q++ = *p;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005757 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 if (*p == '\n' || *p == '\r')
5759 j = 0;
5760 }
5761
5762 return (PyObject*) u;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005763
5764 overflow2:
5765 Py_DECREF(u);
5766 overflow1:
5767 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769}
5770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005771PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772"S.find(sub [,start [,end]]) -> int\n\
5773\n\
5774Return the lowest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00005775such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776arguments start and end are interpreted as in slice notation.\n\
5777\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005778Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779
5780static PyObject *
5781unicode_find(PyUnicodeObject *self, PyObject *args)
5782{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005783 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005784 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005785 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005786 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787
Guido van Rossumb8872e62000-05-09 14:14:27 +00005788 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5789 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005791 substring = PyUnicode_FromObject(substring);
5792 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 return NULL;
5794
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005795 result = stringlib_find_slice(
5796 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5797 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5798 start, end
5799 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
5801 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005802
5803 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804}
5805
5806static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005807unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808{
5809 if (index < 0 || index >= self->length) {
5810 PyErr_SetString(PyExc_IndexError, "string index out of range");
5811 return NULL;
5812 }
5813
5814 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5815}
5816
5817static long
5818unicode_hash(PyUnicodeObject *self)
5819{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005820 /* Since Unicode objects compare equal to their ASCII string
5821 counterparts, they should use the individual character values
5822 as basis for their hash value. This is needed to assure that
5823 strings and Unicode objects behave in the same way as
5824 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
Martin v. Löwis18e16552006-02-15 17:27:45 +00005826 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005827 register Py_UNICODE *p;
5828 register long x;
5829
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 if (self->hash != -1)
5831 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005832 len = PyUnicode_GET_SIZE(self);
5833 p = PyUnicode_AS_UNICODE(self);
5834 x = *p << 7;
5835 while (--len >= 0)
5836 x = (1000003*x) ^ *p++;
5837 x ^= PyUnicode_GET_SIZE(self);
5838 if (x == -1)
5839 x = -2;
5840 self->hash = x;
5841 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842}
5843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005844PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845"S.index(sub [,start [,end]]) -> int\n\
5846\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005847Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848
5849static PyObject *
5850unicode_index(PyUnicodeObject *self, PyObject *args)
5851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005852 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005853 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005854 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005855 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
Guido van Rossumb8872e62000-05-09 14:14:27 +00005857 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5858 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005860 substring = PyUnicode_FromObject(substring);
5861 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 return NULL;
5863
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005864 result = stringlib_find_slice(
5865 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5866 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5867 start, end
5868 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
5870 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005871
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 if (result < 0) {
5873 PyErr_SetString(PyExc_ValueError, "substring not found");
5874 return NULL;
5875 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005876
Martin v. Löwis18e16552006-02-15 17:27:45 +00005877 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878}
5879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005880PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005881"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005883Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005884at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885
5886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005887unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888{
5889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5890 register const Py_UNICODE *e;
5891 int cased;
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 /* Shortcut for single character strings */
5894 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005895 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005897 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005898 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005899 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005900
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 e = p + PyUnicode_GET_SIZE(self);
5902 cased = 0;
5903 for (; p < e; p++) {
5904 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005905
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005907 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 else if (!cased && Py_UNICODE_ISLOWER(ch))
5909 cased = 1;
5910 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005911 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912}
5913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005914PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005915"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005917Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005918at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919
5920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005921unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
5923 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5924 register const Py_UNICODE *e;
5925 int cased;
5926
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 /* Shortcut for single character strings */
5928 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005929 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005931 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005932 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005933 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005934
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 e = p + PyUnicode_GET_SIZE(self);
5936 cased = 0;
5937 for (; p < e; p++) {
5938 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005939
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005941 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 else if (!cased && Py_UNICODE_ISUPPER(ch))
5943 cased = 1;
5944 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}
5947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005948PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005949"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005951Return True if S is a titlecased string and there is at least one\n\
5952character in S, i.e. upper- and titlecase characters may only\n\
5953follow uncased characters and lowercase characters only cased ones.\n\
5954Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
5956static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005957unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958{
5959 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5960 register const Py_UNICODE *e;
5961 int cased, previous_is_cased;
5962
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 /* Shortcut for single character strings */
5964 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005965 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5966 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005968 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005969 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005970 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005971
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 e = p + PyUnicode_GET_SIZE(self);
5973 cased = 0;
5974 previous_is_cased = 0;
5975 for (; p < e; p++) {
5976 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5979 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005980 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 previous_is_cased = 1;
5982 cased = 1;
5983 }
5984 else if (Py_UNICODE_ISLOWER(ch)) {
5985 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005986 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 previous_is_cased = 1;
5988 cased = 1;
5989 }
5990 else
5991 previous_is_cased = 0;
5992 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005993 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994}
5995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005996PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005997"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005999Return True if all characters in S are whitespace\n\
6000and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001
6002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006003unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004{
6005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6006 register const Py_UNICODE *e;
6007
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 /* Shortcut for single character strings */
6009 if (PyUnicode_GET_SIZE(self) == 1 &&
6010 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006011 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006013 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006014 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006015 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006016
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 e = p + PyUnicode_GET_SIZE(self);
6018 for (; p < e; p++) {
6019 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006020 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006022 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023}
6024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006025PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006026"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006027\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006028Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006029and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006030
6031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006032unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006033{
6034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6035 register const Py_UNICODE *e;
6036
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006037 /* Shortcut for single character strings */
6038 if (PyUnicode_GET_SIZE(self) == 1 &&
6039 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006040 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006041
6042 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006043 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006044 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006045
6046 e = p + PyUnicode_GET_SIZE(self);
6047 for (; p < e; p++) {
6048 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006049 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006050 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006051 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006052}
6053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006054PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006055"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006056\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006057Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006058and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006059
6060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006061unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006062{
6063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6064 register const Py_UNICODE *e;
6065
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006066 /* Shortcut for single character strings */
6067 if (PyUnicode_GET_SIZE(self) == 1 &&
6068 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006069 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006070
6071 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006072 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006073 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006074
6075 e = p + PyUnicode_GET_SIZE(self);
6076 for (; p < e; p++) {
6077 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006078 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006079 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006080 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006081}
6082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006083PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006084"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006086Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006087False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
6089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006090unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091{
6092 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6093 register const Py_UNICODE *e;
6094
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 /* Shortcut for single character strings */
6096 if (PyUnicode_GET_SIZE(self) == 1 &&
6097 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006098 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006100 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006101 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006102 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006103
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 e = p + PyUnicode_GET_SIZE(self);
6105 for (; p < e; p++) {
6106 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006107 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006109 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110}
6111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006112PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006113"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006115Return True if all characters in S are digits\n\
6116and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117
6118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006119unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120{
6121 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6122 register const Py_UNICODE *e;
6123
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 /* Shortcut for single character strings */
6125 if (PyUnicode_GET_SIZE(self) == 1 &&
6126 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006127 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006129 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006130 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006131 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 e = p + PyUnicode_GET_SIZE(self);
6134 for (; p < e; p++) {
6135 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006136 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006138 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139}
6140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006141PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006142"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006144Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006145False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
6147static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006148unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149{
6150 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6151 register const Py_UNICODE *e;
6152
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 /* Shortcut for single character strings */
6154 if (PyUnicode_GET_SIZE(self) == 1 &&
6155 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006156 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006158 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006159 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006160 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 e = p + PyUnicode_GET_SIZE(self);
6163 for (; p < e; p++) {
6164 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006165 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006167 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168}
6169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006170PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171"S.join(sequence) -> unicode\n\
6172\n\
6173Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006174sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175
6176static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006177unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006179 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180}
6181
Martin v. Löwis18e16552006-02-15 17:27:45 +00006182static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183unicode_length(PyUnicodeObject *self)
6184{
6185 return self->length;
6186}
6187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006188PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006189"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190\n\
6191Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006192done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193
6194static PyObject *
6195unicode_ljust(PyUnicodeObject *self, PyObject *args)
6196{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006197 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006198 Py_UNICODE fillchar = ' ';
6199
Martin v. Löwis412fb672006-04-13 06:34:32 +00006200 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 return NULL;
6202
Tim Peters7a29bd52001-09-12 03:03:31 +00006203 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 Py_INCREF(self);
6205 return (PyObject*) self;
6206 }
6207
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006208 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209}
6210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006211PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212"S.lower() -> unicode\n\
6213\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006214Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215
6216static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006217unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 return fixup(self, fixlower);
6220}
6221
/* Strip modes; each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string past its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
6230
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006231/* externally visible for str.strip(unicode) */
6232PyObject *
6233_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6234{
6235 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006236 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006237 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006238 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6239 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006240
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006241 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6242
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006243 i = 0;
6244 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006245 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6246 i++;
6247 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006248 }
6249
6250 j = len;
6251 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006252 do {
6253 j--;
6254 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6255 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006256 }
6257
6258 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006259 Py_INCREF(self);
6260 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006261 }
6262 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006263 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006264}
6265
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
6267static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006268do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006270 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006271 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006272
6273 i = 0;
6274 if (striptype != RIGHTSTRIP) {
6275 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6276 i++;
6277 }
6278 }
6279
6280 j = len;
6281 if (striptype != LEFTSTRIP) {
6282 do {
6283 j--;
6284 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6285 j++;
6286 }
6287
6288 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6289 Py_INCREF(self);
6290 return (PyObject*)self;
6291 }
6292 else
6293 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294}
6295
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006296
6297static PyObject *
6298do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6299{
6300 PyObject *sep = NULL;
6301
6302 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6303 return NULL;
6304
6305 if (sep != NULL && sep != Py_None) {
6306 if (PyUnicode_Check(sep))
6307 return _PyUnicode_XStrip(self, striptype, sep);
6308 else if (PyString_Check(sep)) {
6309 PyObject *res;
6310 sep = PyUnicode_FromObject(sep);
6311 if (sep==NULL)
6312 return NULL;
6313 res = _PyUnicode_XStrip(self, striptype, sep);
6314 Py_DECREF(sep);
6315 return res;
6316 }
6317 else {
6318 PyErr_Format(PyExc_TypeError,
6319 "%s arg must be None, unicode or str",
6320 STRIPNAME(striptype));
6321 return NULL;
6322 }
6323 }
6324
6325 return do_strip(self, striptype);
6326}
6327
6328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006329PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006330"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006331\n\
6332Return a copy of the string S with leading and trailing\n\
6333whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006334If chars is given and not None, remove characters in chars instead.\n\
6335If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006336
6337static PyObject *
6338unicode_strip(PyUnicodeObject *self, PyObject *args)
6339{
6340 if (PyTuple_GET_SIZE(args) == 0)
6341 return do_strip(self, BOTHSTRIP); /* Common case */
6342 else
6343 return do_argstrip(self, BOTHSTRIP, args);
6344}
6345
6346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006347PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006348"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006349\n\
6350Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006351If chars is given and not None, remove characters in chars instead.\n\
6352If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006353
6354static PyObject *
6355unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6356{
6357 if (PyTuple_GET_SIZE(args) == 0)
6358 return do_strip(self, LEFTSTRIP); /* Common case */
6359 else
6360 return do_argstrip(self, LEFTSTRIP, args);
6361}
6362
6363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006364PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006365"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006366\n\
6367Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006368If chars is given and not None, remove characters in chars instead.\n\
6369If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006370
6371static PyObject *
6372unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6373{
6374 if (PyTuple_GET_SIZE(args) == 0)
6375 return do_strip(self, RIGHTSTRIP); /* Common case */
6376 else
6377 return do_argstrip(self, RIGHTSTRIP, args);
6378}
6379
6380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383{
6384 PyUnicodeObject *u;
6385 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006386 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006387 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388
6389 if (len < 0)
6390 len = 0;
6391
Tim Peters7a29bd52001-09-12 03:03:31 +00006392 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 /* no repeat, return original string */
6394 Py_INCREF(str);
6395 return (PyObject*) str;
6396 }
Tim Peters8f422462000-09-09 06:13:41 +00006397
6398 /* ensure # of chars needed doesn't overflow int and # of bytes
6399 * needed doesn't overflow size_t
6400 */
6401 nchars = len * str->length;
6402 if (len && nchars / len != str->length) {
6403 PyErr_SetString(PyExc_OverflowError,
6404 "repeated string is too long");
6405 return NULL;
6406 }
6407 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6408 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6409 PyErr_SetString(PyExc_OverflowError,
6410 "repeated string is too long");
6411 return NULL;
6412 }
6413 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 if (!u)
6415 return NULL;
6416
6417 p = u->str;
6418
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006419 if (str->length == 1 && len > 0) {
6420 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006421 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006422 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006423 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006424 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006425 done = str->length;
6426 }
6427 while (done < nchars) {
6428 int n = (done <= nchars-done) ? done : nchars-done;
6429 Py_UNICODE_COPY(p+done, p, n);
6430 done += n;
6431 }
6432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
6434 return (PyObject*) u;
6435}
6436
6437PyObject *PyUnicode_Replace(PyObject *obj,
6438 PyObject *subobj,
6439 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006440 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441{
6442 PyObject *self;
6443 PyObject *str1;
6444 PyObject *str2;
6445 PyObject *result;
6446
6447 self = PyUnicode_FromObject(obj);
6448 if (self == NULL)
6449 return NULL;
6450 str1 = PyUnicode_FromObject(subobj);
6451 if (str1 == NULL) {
6452 Py_DECREF(self);
6453 return NULL;
6454 }
6455 str2 = PyUnicode_FromObject(replobj);
6456 if (str2 == NULL) {
6457 Py_DECREF(self);
6458 Py_DECREF(str1);
6459 return NULL;
6460 }
Tim Petersced69f82003-09-16 20:30:58 +00006461 result = replace((PyUnicodeObject *)self,
6462 (PyUnicodeObject *)str1,
6463 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 maxcount);
6465 Py_DECREF(self);
6466 Py_DECREF(str1);
6467 Py_DECREF(str2);
6468 return result;
6469}
6470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006471PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472"S.replace (old, new[, maxsplit]) -> unicode\n\
6473\n\
6474Return a copy of S with all occurrences of substring\n\
6475old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006476given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477
6478static PyObject*
6479unicode_replace(PyUnicodeObject *self, PyObject *args)
6480{
6481 PyUnicodeObject *str1;
6482 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006483 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 PyObject *result;
6485
Martin v. Löwis18e16552006-02-15 17:27:45 +00006486 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 return NULL;
6488 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6489 if (str1 == NULL)
6490 return NULL;
6491 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006492 if (str2 == NULL) {
6493 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496
6497 result = replace(self, str1, str2, maxcount);
6498
6499 Py_DECREF(str1);
6500 Py_DECREF(str2);
6501 return result;
6502}
6503
6504static
6505PyObject *unicode_repr(PyObject *unicode)
6506{
6507 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6508 PyUnicode_GET_SIZE(unicode),
6509 1);
6510}
6511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006512PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513"S.rfind(sub [,start [,end]]) -> int\n\
6514\n\
6515Return the highest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00006516such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517arguments start and end are interpreted as in slice notation.\n\
6518\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006519Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
6521static PyObject *
6522unicode_rfind(PyUnicodeObject *self, PyObject *args)
6523{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006524 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006525 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006526 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006527 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528
Guido van Rossumb8872e62000-05-09 14:14:27 +00006529 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6530 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006532 substring = PyUnicode_FromObject(substring);
6533 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 return NULL;
6535
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006536 result = stringlib_rfind_slice(
6537 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6538 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6539 start, end
6540 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541
6542 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006543
6544 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545}
6546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006547PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548"S.rindex(sub [,start [,end]]) -> int\n\
6549\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006550Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
6552static PyObject *
6553unicode_rindex(PyUnicodeObject *self, PyObject *args)
6554{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006555 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006556 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006557 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006558 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559
Guido van Rossumb8872e62000-05-09 14:14:27 +00006560 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6561 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006563 substring = PyUnicode_FromObject(substring);
6564 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 return NULL;
6566
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006567 result = stringlib_rfind_slice(
6568 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6569 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6570 start, end
6571 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
6573 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006574
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 if (result < 0) {
6576 PyErr_SetString(PyExc_ValueError, "substring not found");
6577 return NULL;
6578 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006579 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580}
6581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006582PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006583"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584\n\
6585Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006586done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587
6588static PyObject *
6589unicode_rjust(PyUnicodeObject *self, PyObject *args)
6590{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006591 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006592 Py_UNICODE fillchar = ' ';
6593
Martin v. Löwis412fb672006-04-13 06:34:32 +00006594 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 return NULL;
6596
Tim Peters7a29bd52001-09-12 03:03:31 +00006597 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 Py_INCREF(self);
6599 return (PyObject*) self;
6600 }
6601
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006602 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603}
6604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006606unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607{
6608 /* standard clamping */
6609 if (start < 0)
6610 start = 0;
6611 if (end < 0)
6612 end = 0;
6613 if (end > self->length)
6614 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006615 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 /* full slice, return original string */
6617 Py_INCREF(self);
6618 return (PyObject*) self;
6619 }
6620 if (start > end)
6621 start = end;
6622 /* copy slice */
6623 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6624 end - start);
6625}
6626
6627PyObject *PyUnicode_Split(PyObject *s,
6628 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006629 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630{
6631 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006632
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 s = PyUnicode_FromObject(s);
6634 if (s == NULL)
6635 return NULL;
6636 if (sep != NULL) {
6637 sep = PyUnicode_FromObject(sep);
6638 if (sep == NULL) {
6639 Py_DECREF(s);
6640 return NULL;
6641 }
6642 }
6643
6644 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6645
6646 Py_DECREF(s);
6647 Py_XDECREF(sep);
6648 return result;
6649}
6650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006651PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652"S.split([sep [,maxsplit]]) -> list of strings\n\
6653\n\
6654Return a list of the words in S, using sep as the\n\
6655delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006656splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006657any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658
6659static PyObject*
6660unicode_split(PyUnicodeObject *self, PyObject *args)
6661{
6662 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006663 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664
Martin v. Löwis18e16552006-02-15 17:27:45 +00006665 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 return NULL;
6667
6668 if (substring == Py_None)
6669 return split(self, NULL, maxcount);
6670 else if (PyUnicode_Check(substring))
6671 return split(self, (PyUnicodeObject *)substring, maxcount);
6672 else
6673 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6674}
6675
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006676PyObject *
6677PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6678{
6679 PyObject* str_obj;
6680 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006681 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006682
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006683 str_obj = PyUnicode_FromObject(str_in);
6684 if (!str_obj)
6685 return NULL;
6686 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006687 if (!sep_obj) {
6688 Py_DECREF(str_obj);
6689 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006690 }
6691
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006692 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006693 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6694 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6695 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006696
Fredrik Lundhb9479482006-05-26 17:22:38 +00006697 Py_DECREF(sep_obj);
6698 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006699
6700 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006701}
6702
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006703
6704PyObject *
6705PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6706{
6707 PyObject* str_obj;
6708 PyObject* sep_obj;
6709 PyObject* out;
6710
6711 str_obj = PyUnicode_FromObject(str_in);
6712 if (!str_obj)
6713 return NULL;
6714 sep_obj = PyUnicode_FromObject(sep_in);
6715 if (!sep_obj) {
6716 Py_DECREF(str_obj);
6717 return NULL;
6718 }
6719
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006720 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006721 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6722 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6723 );
6724
6725 Py_DECREF(sep_obj);
6726 Py_DECREF(str_obj);
6727
6728 return out;
6729}
6730
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006731PyDoc_STRVAR(partition__doc__,
6732"S.partition(sep) -> (head, sep, tail)\n\
6733\n\
6734Searches for the separator sep in S, and returns the part before it,\n\
6735the separator itself, and the part after it. If the separator is not\n\
6736found, returns S and two empty strings.");
6737
6738static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006739unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006740{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006741 return PyUnicode_Partition((PyObject *)self, separator);
6742}
6743
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006744PyDoc_STRVAR(rpartition__doc__,
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006745"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006746\n\
6747Searches for the separator sep in S, starting at the end of S, and returns\n\
6748the part before it, the separator itself, and the part after it. If the\n\
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006749separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006750
6751static PyObject*
6752unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6753{
6754 return PyUnicode_RPartition((PyObject *)self, separator);
6755}
6756
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006757PyObject *PyUnicode_RSplit(PyObject *s,
6758 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006759 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006760{
6761 PyObject *result;
6762
6763 s = PyUnicode_FromObject(s);
6764 if (s == NULL)
6765 return NULL;
6766 if (sep != NULL) {
6767 sep = PyUnicode_FromObject(sep);
6768 if (sep == NULL) {
6769 Py_DECREF(s);
6770 return NULL;
6771 }
6772 }
6773
6774 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6775
6776 Py_DECREF(s);
6777 Py_XDECREF(sep);
6778 return result;
6779}
6780
6781PyDoc_STRVAR(rsplit__doc__,
6782"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6783\n\
6784Return a list of the words in S, using sep as the\n\
6785delimiter string, starting at the end of the string and\n\
6786working to the front. If maxsplit is given, at most maxsplit\n\
6787splits are done. If sep is not specified, any whitespace string\n\
6788is a separator.");
6789
6790static PyObject*
6791unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6792{
6793 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006794 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006795
Martin v. Löwis18e16552006-02-15 17:27:45 +00006796 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006797 return NULL;
6798
6799 if (substring == Py_None)
6800 return rsplit(self, NULL, maxcount);
6801 else if (PyUnicode_Check(substring))
6802 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6803 else
6804 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6805}
6806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006807PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006808"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809\n\
6810Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006811Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006812is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813
6814static PyObject*
6815unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6816{
Guido van Rossum86662912000-04-11 15:38:46 +00006817 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818
Guido van Rossum86662912000-04-11 15:38:46 +00006819 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 return NULL;
6821
Guido van Rossum86662912000-04-11 15:38:46 +00006822 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
6825static
6826PyObject *unicode_str(PyUnicodeObject *self)
6827{
Fred Drakee4315f52000-05-09 19:53:39 +00006828 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829}
6830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006831PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832"S.swapcase() -> unicode\n\
6833\n\
6834Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006835and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836
6837static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006838unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 return fixup(self, fixswapcase);
6841}
6842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006843PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844"S.translate(table) -> unicode\n\
6845\n\
6846Return a copy of the string S, where all characters have been mapped\n\
6847through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006848Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6849Unmapped characters are left untouched. Characters mapped to None\n\
6850are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
6852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006853unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854{
Tim Petersced69f82003-09-16 20:30:58 +00006855 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006857 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 "ignore");
6859}
6860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862"S.upper() -> unicode\n\
6863\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006864Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865
6866static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006867unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 return fixup(self, fixupper);
6870}
6871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006872PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873"S.zfill(width) -> unicode\n\
6874\n\
6875Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006876of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877
6878static PyObject *
6879unicode_zfill(PyUnicodeObject *self, PyObject *args)
6880{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006881 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 PyUnicodeObject *u;
6883
Martin v. Löwis18e16552006-02-15 17:27:45 +00006884 Py_ssize_t width;
6885 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 return NULL;
6887
6888 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006889 if (PyUnicode_CheckExact(self)) {
6890 Py_INCREF(self);
6891 return (PyObject*) self;
6892 }
6893 else
6894 return PyUnicode_FromUnicode(
6895 PyUnicode_AS_UNICODE(self),
6896 PyUnicode_GET_SIZE(self)
6897 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 }
6899
6900 fill = width - self->length;
6901
6902 u = pad(self, fill, 0, '0');
6903
Walter Dörwald068325e2002-04-15 13:36:47 +00006904 if (u == NULL)
6905 return NULL;
6906
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 if (u->str[fill] == '+' || u->str[fill] == '-') {
6908 /* move sign to beginning of string */
6909 u->str[0] = u->str[fill];
6910 u->str[fill] = '0';
6911 }
6912
6913 return (PyObject*) u;
6914}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
#if 0
/* Debugging helper (compiled out): expose unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006924PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006925"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006927Return True if S starts with the specified prefix, False otherwise.\n\
6928With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006929With optional end, stop comparing S at that position.\n\
6930prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
6932static PyObject *
6933unicode_startswith(PyUnicodeObject *self,
6934 PyObject *args)
6935{
Georg Brandl24250812006-06-09 18:45:48 +00006936 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006938 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006939 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006940 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941
Georg Brandl24250812006-06-09 18:45:48 +00006942 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006943 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006945 if (PyTuple_Check(subobj)) {
6946 Py_ssize_t i;
6947 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6948 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6949 PyTuple_GET_ITEM(subobj, i));
6950 if (substring == NULL)
6951 return NULL;
6952 result = tailmatch(self, substring, start, end, -1);
6953 Py_DECREF(substring);
6954 if (result) {
6955 Py_RETURN_TRUE;
6956 }
6957 }
6958 /* nothing matched */
6959 Py_RETURN_FALSE;
6960 }
6961 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00006963 return NULL;
6964 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00006966 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967}
6968
6969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006970PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006971"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006973Return True if S ends with the specified suffix, False otherwise.\n\
6974With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006975With optional end, stop comparing S at that position.\n\
6976suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977
6978static PyObject *
6979unicode_endswith(PyUnicodeObject *self,
6980 PyObject *args)
6981{
Georg Brandl24250812006-06-09 18:45:48 +00006982 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006984 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006985 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006986 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987
Georg Brandl24250812006-06-09 18:45:48 +00006988 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6989 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006991 if (PyTuple_Check(subobj)) {
6992 Py_ssize_t i;
6993 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6994 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6995 PyTuple_GET_ITEM(subobj, i));
6996 if (substring == NULL)
6997 return NULL;
6998 result = tailmatch(self, substring, start, end, +1);
6999 Py_DECREF(substring);
7000 if (result) {
7001 Py_RETURN_TRUE;
7002 }
7003 }
7004 Py_RETURN_FALSE;
7005 }
7006 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007008 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009
Georg Brandl24250812006-06-09 18:45:48 +00007010 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007012 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013}
7014
7015
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007016
7017static PyObject *
7018unicode_getnewargs(PyUnicodeObject *v)
7019{
7020 return Py_BuildValue("(u#)", v->str, v->length);
7021}
7022
7023
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024static PyMethodDef unicode_methods[] = {
7025
7026 /* Order is according to common usage: often used methods should
7027 appear first, since lookup is done sequentially. */
7028
Georg Brandlecdc0a92006-03-30 12:19:07 +00007029 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007030 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7031 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007032 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007033 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7034 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7035 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7036 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7037 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7038 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7039 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007040 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007041 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7042 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7043 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007044 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007045 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007046/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7047 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7048 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7049 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007050 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007051 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007052 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007053 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007054 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7055 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7056 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7057 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7058 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7059 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7060 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7061 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7062 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7063 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7064 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7065 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7066 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7067 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007068 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007069#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007070 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071#endif
7072
7073#if 0
7074 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007075 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076#endif
7077
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007078 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 {NULL, NULL}
7080};
7081
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007082static PyObject *
7083unicode_mod(PyObject *v, PyObject *w)
7084{
7085 if (!PyUnicode_Check(v)) {
7086 Py_INCREF(Py_NotImplemented);
7087 return Py_NotImplemented;
7088 }
7089 return PyUnicode_Format(v, w);
7090}
7091
7092static PyNumberMethods unicode_as_number = {
7093 0, /*nb_add*/
7094 0, /*nb_subtract*/
7095 0, /*nb_multiply*/
7096 0, /*nb_divide*/
7097 unicode_mod, /*nb_remainder*/
7098};
7099
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007101 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007102 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007103 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7104 (ssizeargfunc) unicode_getitem, /* sq_item */
7105 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 0, /* sq_ass_item */
7107 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007108 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109};
7110
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007111static PyObject*
7112unicode_subscript(PyUnicodeObject* self, PyObject* item)
7113{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007114 if (PyIndex_Check(item)) {
7115 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007116 if (i == -1 && PyErr_Occurred())
7117 return NULL;
7118 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007119 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007120 return unicode_getitem(self, i);
7121 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007122 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007123 Py_UNICODE* source_buf;
7124 Py_UNICODE* result_buf;
7125 PyObject* result;
7126
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007127 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007128 &start, &stop, &step, &slicelength) < 0) {
7129 return NULL;
7130 }
7131
7132 if (slicelength <= 0) {
7133 return PyUnicode_FromUnicode(NULL, 0);
7134 } else {
7135 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitzb3635f92008-03-18 04:17:36 +00007136 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7137 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007138
7139 if (result_buf == NULL)
7140 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007141
7142 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7143 result_buf[i] = source_buf[cur];
7144 }
Tim Petersced69f82003-09-16 20:30:58 +00007145
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007146 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitzb3635f92008-03-18 04:17:36 +00007147 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007148 return result;
7149 }
7150 } else {
7151 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7152 return NULL;
7153 }
7154}
7155
7156static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007157 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007158 (binaryfunc)unicode_subscript, /* mp_subscript */
7159 (objobjargproc)0, /* mp_ass_subscript */
7160};
7161
Martin v. Löwis18e16552006-02-15 17:27:45 +00007162static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007164 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 const void **ptr)
7166{
7167 if (index != 0) {
7168 PyErr_SetString(PyExc_SystemError,
7169 "accessing non-existent unicode segment");
7170 return -1;
7171 }
7172 *ptr = (void *) self->str;
7173 return PyUnicode_GET_DATA_SIZE(self);
7174}
7175
Martin v. Löwis18e16552006-02-15 17:27:45 +00007176static Py_ssize_t
7177unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 const void **ptr)
7179{
7180 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007181 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 return -1;
7183}
7184
7185static int
7186unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007187 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188{
7189 if (lenp)
7190 *lenp = PyUnicode_GET_DATA_SIZE(self);
7191 return 1;
7192}
7193
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007194static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007196 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 const void **ptr)
7198{
7199 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007200
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 if (index != 0) {
7202 PyErr_SetString(PyExc_SystemError,
7203 "accessing non-existent unicode segment");
7204 return -1;
7205 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007206 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 if (str == NULL)
7208 return -1;
7209 *ptr = (void *) PyString_AS_STRING(str);
7210 return PyString_GET_SIZE(str);
7211}
7212
7213/* Helpers for PyUnicode_Format() */
7214
7215static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007216getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007218 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 if (argidx < arglen) {
7220 (*p_argidx)++;
7221 if (arglen < 0)
7222 return args;
7223 else
7224 return PyTuple_GetItem(args, argidx);
7225 }
7226 PyErr_SetString(PyExc_TypeError,
7227 "not enough arguments for format string");
7228 return NULL;
7229}
7230
/* Flag bits set by PyUnicode_Format() while scanning the flag characters
   of a % format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7236
Martin v. Löwis18e16552006-02-15 17:27:45 +00007237static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007238strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007240 register Py_ssize_t i;
7241 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 for (i = len - 1; i >= 0; i--)
7243 buffer[i] = (Py_UNICODE) charbuffer[i];
7244
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 return len;
7246}
7247
Neal Norwitzfc76d632006-01-10 06:03:13 +00007248static int
7249doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7250{
Tim Peters15231542006-02-16 01:08:01 +00007251 Py_ssize_t result;
7252
Neal Norwitzfc76d632006-01-10 06:03:13 +00007253 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007254 result = strtounicode(buffer, (char *)buffer);
7255 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007256}
7257
7258static int
7259longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7260{
Tim Peters15231542006-02-16 01:08:01 +00007261 Py_ssize_t result;
7262
Neal Norwitzfc76d632006-01-10 06:03:13 +00007263 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007264 result = strtounicode(buffer, (char *)buffer);
7265 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007266}
7267
Guido van Rossum078151d2002-08-11 04:24:12 +00007268/* XXX To save some code duplication, formatfloat/long/int could have been
7269 shared with stringobject.c, converting from 8-bit to Unicode after the
7270 formatting is done. */
7271
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272static int
7273formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007274 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 int flags,
7276 int prec,
7277 int type,
7278 PyObject *v)
7279{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007280 /* fmt = '%#.' + `prec` + `type`
7281 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 char fmt[20];
7283 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007284
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 x = PyFloat_AsDouble(v);
7286 if (x == -1.0 && PyErr_Occurred())
7287 return -1;
7288 if (prec < 0)
7289 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7291 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007292 /* Worst case length calc to ensure no buffer overrun:
7293
7294 'g' formats:
7295 fmt = %#.<prec>g
7296 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7297 for any double rep.)
7298 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7299
7300 'f' formats:
7301 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7302 len = 1 + 50 + 1 + prec = 52 + prec
7303
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007304 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007305 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007306
7307 */
Georg Brandlc5db9232007-07-12 08:38:04 +00007308 if (((type == 'g' || type == 'G') &&
7309 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007310 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007311 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007312 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007313 return -1;
7314 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007315 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7316 (flags&F_ALT) ? "#" : "",
7317 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007318 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319}
7320
Tim Peters38fd5b62000-09-21 05:43:11 +00007321static PyObject*
7322formatlong(PyObject *val, int flags, int prec, int type)
7323{
7324 char *buf;
7325 int i, len;
7326 PyObject *str; /* temporary string object. */
7327 PyUnicodeObject *result;
7328
7329 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7330 if (!str)
7331 return NULL;
7332 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007333 if (!result) {
7334 Py_DECREF(str);
7335 return NULL;
7336 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007337 for (i = 0; i < len; i++)
7338 result->str[i] = buf[i];
7339 result->str[len] = 0;
7340 Py_DECREF(str);
7341 return (PyObject*)result;
7342}
7343
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344static int
7345formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007346 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 int flags,
7348 int prec,
7349 int type,
7350 PyObject *v)
7351{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007352 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007353 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7354 * + 1 + 1
7355 * = 24
7356 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007357 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007358 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 long x;
7360
7361 x = PyInt_AsLong(v);
7362 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007363 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007364 if (x < 0 && type == 'u') {
7365 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007366 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007367 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7368 sign = "-";
7369 else
7370 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007372 prec = 1;
7373
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007374 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7375 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007376 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007377 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007378 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007379 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007380 return -1;
7381 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007382
7383 if ((flags & F_ALT) &&
7384 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007385 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007386 * of issues that cause pain:
7387 * - when 0 is being converted, the C standard leaves off
7388 * the '0x' or '0X', which is inconsistent with other
7389 * %#x/%#X conversions and inconsistent with Python's
7390 * hex() function
7391 * - there are platforms that violate the standard and
7392 * convert 0 with the '0x' or '0X'
7393 * (Metrowerks, Compaq Tru64)
7394 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007395 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007396 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007397 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007398 * We can achieve the desired consistency by inserting our
7399 * own '0x' or '0X' prefix, and substituting %x/%X in place
7400 * of %#x/%#X.
7401 *
7402 * Note that this is the same approach as used in
7403 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007404 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007405 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7406 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007407 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007408 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007409 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7410 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007411 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007412 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007413 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007414 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007415 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007416 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417}
7418
7419static int
7420formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007421 size_t buflen,
7422 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007424 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007425 if (PyUnicode_Check(v)) {
7426 if (PyUnicode_GET_SIZE(v) != 1)
7427 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007431 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007432 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007433 goto onError;
7434 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7435 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436
7437 else {
7438 /* Integer input truncated to a character */
7439 long x;
7440 x = PyInt_AsLong(v);
7441 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007442 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007443#ifdef Py_UNICODE_WIDE
7444 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007445 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007446 "%c arg not in range(0x110000) "
7447 "(wide Python build)");
7448 return -1;
7449 }
7450#else
7451 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007452 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007453 "%c arg not in range(0x10000) "
7454 "(narrow Python build)");
7455 return -1;
7456 }
7457#endif
7458 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 }
7460 buf[1] = '\0';
7461 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007462
7463 onError:
7464 PyErr_SetString(PyExc_TypeError,
7465 "%c requires int or char");
7466 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467}
7468
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is wrapped in parentheses so that the cast expression
   behaves as a single operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
7478
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479PyObject *PyUnicode_Format(PyObject *format,
7480 PyObject *args)
7481{
7482 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007483 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 int args_owned = 0;
7485 PyUnicodeObject *result = NULL;
7486 PyObject *dict = NULL;
7487 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007488
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 if (format == NULL || args == NULL) {
7490 PyErr_BadInternalCall();
7491 return NULL;
7492 }
7493 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007494 if (uformat == NULL)
7495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 fmt = PyUnicode_AS_UNICODE(uformat);
7497 fmtcnt = PyUnicode_GET_SIZE(uformat);
7498
7499 reslen = rescnt = fmtcnt + 100;
7500 result = _PyUnicode_New(reslen);
7501 if (result == NULL)
7502 goto onError;
7503 res = PyUnicode_AS_UNICODE(result);
7504
7505 if (PyTuple_Check(args)) {
7506 arglen = PyTuple_Size(args);
7507 argidx = 0;
7508 }
7509 else {
7510 arglen = -1;
7511 argidx = -2;
7512 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007513 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7514 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 dict = args;
7516
7517 while (--fmtcnt >= 0) {
7518 if (*fmt != '%') {
7519 if (--rescnt < 0) {
7520 rescnt = fmtcnt + 100;
7521 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007522 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007523 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7525 --rescnt;
7526 }
7527 *res++ = *fmt++;
7528 }
7529 else {
7530 /* Got a format specifier */
7531 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007532 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 Py_UNICODE c = '\0';
7535 Py_UNICODE fill;
7536 PyObject *v = NULL;
7537 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007538 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007541 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542
7543 fmt++;
7544 if (*fmt == '(') {
7545 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007546 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 PyObject *key;
7548 int pcount = 1;
7549
7550 if (dict == NULL) {
7551 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007552 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 goto onError;
7554 }
7555 ++fmt;
7556 --fmtcnt;
7557 keystart = fmt;
7558 /* Skip over balanced parentheses */
7559 while (pcount > 0 && --fmtcnt >= 0) {
7560 if (*fmt == ')')
7561 --pcount;
7562 else if (*fmt == '(')
7563 ++pcount;
7564 fmt++;
7565 }
7566 keylen = fmt - keystart - 1;
7567 if (fmtcnt < 0 || pcount > 0) {
7568 PyErr_SetString(PyExc_ValueError,
7569 "incomplete format key");
7570 goto onError;
7571 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007572#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007573 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 then looked up since Python uses strings to hold
7575 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007576 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 key = PyUnicode_EncodeUTF8(keystart,
7578 keylen,
7579 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007580#else
7581 key = PyUnicode_FromUnicode(keystart, keylen);
7582#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 if (key == NULL)
7584 goto onError;
7585 if (args_owned) {
7586 Py_DECREF(args);
7587 args_owned = 0;
7588 }
7589 args = PyObject_GetItem(dict, key);
7590 Py_DECREF(key);
7591 if (args == NULL) {
7592 goto onError;
7593 }
7594 args_owned = 1;
7595 arglen = -1;
7596 argidx = -2;
7597 }
7598 while (--fmtcnt >= 0) {
7599 switch (c = *fmt++) {
7600 case '-': flags |= F_LJUST; continue;
7601 case '+': flags |= F_SIGN; continue;
7602 case ' ': flags |= F_BLANK; continue;
7603 case '#': flags |= F_ALT; continue;
7604 case '0': flags |= F_ZERO; continue;
7605 }
7606 break;
7607 }
7608 if (c == '*') {
7609 v = getnextarg(args, arglen, &argidx);
7610 if (v == NULL)
7611 goto onError;
7612 if (!PyInt_Check(v)) {
7613 PyErr_SetString(PyExc_TypeError,
7614 "* wants int");
7615 goto onError;
7616 }
7617 width = PyInt_AsLong(v);
7618 if (width < 0) {
7619 flags |= F_LJUST;
7620 width = -width;
7621 }
7622 if (--fmtcnt >= 0)
7623 c = *fmt++;
7624 }
7625 else if (c >= '0' && c <= '9') {
7626 width = c - '0';
7627 while (--fmtcnt >= 0) {
7628 c = *fmt++;
7629 if (c < '0' || c > '9')
7630 break;
7631 if ((width*10) / 10 != width) {
7632 PyErr_SetString(PyExc_ValueError,
7633 "width too big");
7634 goto onError;
7635 }
7636 width = width*10 + (c - '0');
7637 }
7638 }
7639 if (c == '.') {
7640 prec = 0;
7641 if (--fmtcnt >= 0)
7642 c = *fmt++;
7643 if (c == '*') {
7644 v = getnextarg(args, arglen, &argidx);
7645 if (v == NULL)
7646 goto onError;
7647 if (!PyInt_Check(v)) {
7648 PyErr_SetString(PyExc_TypeError,
7649 "* wants int");
7650 goto onError;
7651 }
7652 prec = PyInt_AsLong(v);
7653 if (prec < 0)
7654 prec = 0;
7655 if (--fmtcnt >= 0)
7656 c = *fmt++;
7657 }
7658 else if (c >= '0' && c <= '9') {
7659 prec = c - '0';
7660 while (--fmtcnt >= 0) {
7661 c = Py_CHARMASK(*fmt++);
7662 if (c < '0' || c > '9')
7663 break;
7664 if ((prec*10) / 10 != prec) {
7665 PyErr_SetString(PyExc_ValueError,
7666 "prec too big");
7667 goto onError;
7668 }
7669 prec = prec*10 + (c - '0');
7670 }
7671 }
7672 } /* prec */
7673 if (fmtcnt >= 0) {
7674 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 if (--fmtcnt >= 0)
7676 c = *fmt++;
7677 }
7678 }
7679 if (fmtcnt < 0) {
7680 PyErr_SetString(PyExc_ValueError,
7681 "incomplete format");
7682 goto onError;
7683 }
7684 if (c != '%') {
7685 v = getnextarg(args, arglen, &argidx);
7686 if (v == NULL)
7687 goto onError;
7688 }
7689 sign = 0;
7690 fill = ' ';
7691 switch (c) {
7692
7693 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007694 pbuf = formatbuf;
7695 /* presume that buffer length is at least 1 */
7696 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 len = 1;
7698 break;
7699
7700 case 's':
7701 case 'r':
7702 if (PyUnicode_Check(v) && c == 's') {
7703 temp = v;
7704 Py_INCREF(temp);
7705 }
7706 else {
7707 PyObject *unicode;
7708 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007709 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 else
7711 temp = PyObject_Repr(v);
7712 if (temp == NULL)
7713 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007714 if (PyUnicode_Check(temp))
7715 /* nothing to do */;
7716 else if (PyString_Check(temp)) {
7717 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007718 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007720 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007722 Py_DECREF(temp);
7723 temp = unicode;
7724 if (temp == NULL)
7725 goto onError;
7726 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007727 else {
7728 Py_DECREF(temp);
7729 PyErr_SetString(PyExc_TypeError,
7730 "%s argument has non-string str()");
7731 goto onError;
7732 }
7733 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007734 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 len = PyUnicode_GET_SIZE(temp);
7736 if (prec >= 0 && len > prec)
7737 len = prec;
7738 break;
7739
7740 case 'i':
7741 case 'd':
7742 case 'u':
7743 case 'o':
7744 case 'x':
7745 case 'X':
7746 if (c == 'i')
7747 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007748 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007749 temp = formatlong(v, flags, prec, c);
7750 if (!temp)
7751 goto onError;
7752 pbuf = PyUnicode_AS_UNICODE(temp);
7753 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007754 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007756 else {
7757 pbuf = formatbuf;
7758 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7759 flags, prec, c, v);
7760 if (len < 0)
7761 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007762 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007763 }
7764 if (flags & F_ZERO)
7765 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 break;
7767
7768 case 'e':
7769 case 'E':
7770 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007771 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 case 'g':
7773 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007774 if (c == 'F')
7775 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007776 pbuf = formatbuf;
7777 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7778 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 if (len < 0)
7780 goto onError;
7781 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007782 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 fill = '0';
7784 break;
7785
7786 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007787 pbuf = formatbuf;
7788 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 if (len < 0)
7790 goto onError;
7791 break;
7792
7793 default:
7794 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007795 "unsupported format character '%c' (0x%x) "
Armin Rigo4b63c212006-10-04 11:44:06 +00007796 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007797 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007798 (int)c,
Armin Rigo4b63c212006-10-04 11:44:06 +00007799 (Py_ssize_t)(fmt - 1 -
7800 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 goto onError;
7802 }
7803 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007804 if (*pbuf == '-' || *pbuf == '+') {
7805 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 len--;
7807 }
7808 else if (flags & F_SIGN)
7809 sign = '+';
7810 else if (flags & F_BLANK)
7811 sign = ' ';
7812 else
7813 sign = 0;
7814 }
7815 if (width < len)
7816 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007817 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 reslen -= rescnt;
7819 rescnt = width + fmtcnt + 100;
7820 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007821 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007822 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007823 PyErr_NoMemory();
7824 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007825 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007826 if (_PyUnicode_Resize(&result, reslen) < 0) {
7827 Py_XDECREF(temp);
7828 goto onError;
7829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 res = PyUnicode_AS_UNICODE(result)
7831 + reslen - rescnt;
7832 }
7833 if (sign) {
7834 if (fill != ' ')
7835 *res++ = sign;
7836 rescnt--;
7837 if (width > len)
7838 width--;
7839 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007840 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7841 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007842 assert(pbuf[1] == c);
7843 if (fill != ' ') {
7844 *res++ = *pbuf++;
7845 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007846 }
Tim Petersfff53252001-04-12 18:38:48 +00007847 rescnt -= 2;
7848 width -= 2;
7849 if (width < 0)
7850 width = 0;
7851 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 if (width > len && !(flags & F_LJUST)) {
7854 do {
7855 --rescnt;
7856 *res++ = fill;
7857 } while (--width > len);
7858 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007859 if (fill == ' ') {
7860 if (sign)
7861 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007862 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007863 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007864 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007865 *res++ = *pbuf++;
7866 *res++ = *pbuf++;
7867 }
7868 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007869 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 res += len;
7871 rescnt -= len;
7872 while (--width >= len) {
7873 --rescnt;
7874 *res++ = ' ';
7875 }
7876 if (dict && (argidx < arglen) && c != '%') {
7877 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007878 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007879 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 goto onError;
7881 }
7882 Py_XDECREF(temp);
7883 } /* '%' */
7884 } /* until end */
7885 if (argidx < arglen && !dict) {
7886 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007887 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 goto onError;
7889 }
7890
Thomas Woutersa96affe2006-03-12 00:29:36 +00007891 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7892 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 if (args_owned) {
7894 Py_DECREF(args);
7895 }
7896 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 return (PyObject *)result;
7898
7899 onError:
7900 Py_XDECREF(result);
7901 Py_DECREF(uformat);
7902 if (args_owned) {
7903 Py_DECREF(args);
7904 }
7905 return NULL;
7906}
7907
7908static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007909 (readbufferproc) unicode_buffer_getreadbuf,
7910 (writebufferproc) unicode_buffer_getwritebuf,
7911 (segcountproc) unicode_buffer_getsegcount,
7912 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913};
7914
Jeremy Hylton938ace62002-07-17 16:30:39 +00007915static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007916unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7917
Tim Peters6d6c1a32001-08-02 04:15:00 +00007918static PyObject *
7919unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7920{
7921 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007922 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007923 char *encoding = NULL;
7924 char *errors = NULL;
7925
Guido van Rossume023fe02001-08-30 03:12:59 +00007926 if (type != &PyUnicode_Type)
7927 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007928 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7929 kwlist, &x, &encoding, &errors))
7930 return NULL;
7931 if (x == NULL)
7932 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007933 if (encoding == NULL && errors == NULL)
7934 return PyObject_Unicode(x);
7935 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007936 return PyUnicode_FromEncodedObject(x, encoding, errors);
7937}
7938
Guido van Rossume023fe02001-08-30 03:12:59 +00007939static PyObject *
7940unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7941{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007942 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007943 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007944
7945 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7946 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7947 if (tmp == NULL)
7948 return NULL;
7949 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007950 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007951 if (pnew == NULL) {
7952 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007953 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007954 }
Neal Norwitzb3635f92008-03-18 04:17:36 +00007955 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007956 if (pnew->str == NULL) {
7957 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007958 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007959 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007960 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007961 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007962 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7963 pnew->length = n;
7964 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007965 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007966 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007967}
7968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007969PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007970"unicode(string [, encoding[, errors]]) -> object\n\
7971\n\
7972Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007973encoding defaults to the current default string encoding.\n\
7974errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007975
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976PyTypeObject PyUnicode_Type = {
7977 PyObject_HEAD_INIT(&PyType_Type)
7978 0, /* ob_size */
7979 "unicode", /* tp_name */
7980 sizeof(PyUnicodeObject), /* tp_size */
7981 0, /* tp_itemsize */
7982 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007983 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007985 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00007987 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007988 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007989 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007991 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 (hashfunc) unicode_hash, /* tp_hash*/
7993 0, /* tp_call*/
7994 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007995 PyObject_GenericGetAttr, /* tp_getattro */
7996 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007998 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7999 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008000 unicode_doc, /* tp_doc */
8001 0, /* tp_traverse */
8002 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008003 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008004 0, /* tp_weaklistoffset */
8005 0, /* tp_iter */
8006 0, /* tp_iternext */
8007 unicode_methods, /* tp_methods */
8008 0, /* tp_members */
8009 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008010 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008011 0, /* tp_dict */
8012 0, /* tp_descr_get */
8013 0, /* tp_descr_set */
8014 0, /* tp_dictoffset */
8015 0, /* tp_init */
8016 0, /* tp_alloc */
8017 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008018 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019};
8020
8021/* Initialize the Unicode implementation */
8022
Thomas Wouters78890102000-07-22 19:25:51 +00008023void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008025 int i;
8026
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008027 /* XXX - move this array to unicodectype.c ? */
8028 Py_UNICODE linebreak[] = {
8029 0x000A, /* LINE FEED */
8030 0x000D, /* CARRIAGE RETURN */
8031 0x001C, /* FILE SEPARATOR */
8032 0x001D, /* GROUP SEPARATOR */
8033 0x001E, /* RECORD SEPARATOR */
8034 0x0085, /* NEXT LINE */
8035 0x2028, /* LINE SEPARATOR */
8036 0x2029, /* PARAGRAPH SEPARATOR */
8037 };
8038
Fred Drakee4315f52000-05-09 19:53:39 +00008039 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008040 unicode_freelist = NULL;
8041 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008043 if (!unicode_empty)
8044 return;
8045
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008046 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008047 for (i = 0; i < 256; i++)
8048 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008049 if (PyType_Ready(&PyUnicode_Type) < 0)
8050 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008051
8052 /* initialize the linebreak bloom filter */
8053 bloom_linebreak = make_bloom_mask(
8054 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8055 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008056
8057 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058}
8059
8060/* Finalize the Unicode implementation */
8061
8062void
Thomas Wouters78890102000-07-22 19:25:51 +00008063_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008065 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008066 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008068 Py_XDECREF(unicode_empty);
8069 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008070
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008071 for (i = 0; i < 256; i++) {
8072 if (unicode_latin1[i]) {
8073 Py_DECREF(unicode_latin1[i]);
8074 unicode_latin1[i] = NULL;
8075 }
8076 }
8077
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008078 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 PyUnicodeObject *v = u;
8080 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008081 if (v->str)
Neal Norwitzb3635f92008-03-18 04:17:36 +00008082 PyObject_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008083 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008084 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008086 unicode_freelist = NULL;
8087 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008089
Anthony Baxterac6bd462006-04-13 02:06:09 +00008090#ifdef __cplusplus
8091}
8092#endif
8093
8094
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008095/*
8096Local variables:
8097c-basic-offset: 4
8098indent-tabs-mode: nil
8099End:
8100*/