blob: e2f1ed323d48e69199efb100d82512fe6c8a885e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep-alive" optimization.

   A Unicode object that is put on the free list keeps its character
   buffer allocated as long as its length is below this limit.  This
   saves malloc() round trips for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes allocated but unused.

   A limit of 0 effectively switches the optimization off.

   Note: This is an experimental feature!  If Unicode objects cause
   core dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, with the least
   significant 5 bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
135#define BLOOM_MASK unsigned long
136
137static BLOOM_MASK bloom_linebreak;
138
139#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
140
141#define BLOOM_LINEBREAK(ch)\
142 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
169#define BLOOM_MEMBER(mask, chr, set, setlen)\
170 BLOOM(mask, chr) && unicode_member(chr, set, setlen)
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
Neal Norwitzb3635f92008-03-18 04:17:36 +0000203 unicode->str = PyObject_REALLOC(unicode->str,
204 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000206 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 PyErr_NoMemory();
208 return -1;
209 }
210 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000211 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000215 if (unicode->defenc) {
216 Py_DECREF(unicode->defenc);
217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 }
219 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000220
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 return 0;
222}
223
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
231
232static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000233PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234{
235 register PyUnicodeObject *unicode;
236
Andrew Dalkee0df7622006-05-27 11:04:36 +0000237 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 if (length == 0 && unicode_empty != NULL) {
239 Py_INCREF(unicode_empty);
240 return unicode_empty;
241 }
242
243 /* Unicode freelist & memory allocation */
244 if (unicode_freelist) {
245 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000246 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000249 /* Keep-Alive optimization: we only upsize the buffer,
250 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000251 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000252 unicode_resize(unicode, length) < 0) {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000253 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000254 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 }
256 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000257 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000258 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
259 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000260 }
261 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 }
263 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000264 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000265 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode == NULL)
267 return NULL;
Neal Norwitzb3635f92008-03-18 04:17:36 +0000268 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
269 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 }
271
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000272 if (!unicode->str) {
273 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000274 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000275 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000276 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000277 * the caller fails before initializing str -- unicode_resize()
278 * reads str[0], and the Keep-Alive optimization can keep memory
279 * allocated for str alive across a call to unicode_dealloc(unicode).
280 * We don't want unicode_resize to read uninitialized memory in
281 * that case.
282 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000283 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000285 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289
290 onError:
291 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000292 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294}
295
296static
Guido van Rossum9475a232001-10-05 20:51:39 +0000297void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000299 if (PyUnicode_CheckExact(unicode) &&
300 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000301 /* Keep-Alive optimization */
302 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000303 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 unicode->str = NULL;
305 unicode->length = 0;
306 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000307 if (unicode->defenc) {
308 Py_DECREF(unicode->defenc);
309 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000310 }
311 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 *(PyUnicodeObject **)unicode = unicode_freelist;
313 unicode_freelist = unicode;
314 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 }
316 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000317 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000318 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000319 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 }
321}
322
Martin v. Löwis18e16552006-02-15 17:27:45 +0000323int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000324{
325 register PyUnicodeObject *v;
326
327 /* Argument checks */
328 if (unicode == NULL) {
329 PyErr_BadInternalCall();
330 return -1;
331 }
332 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000333 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000334 PyErr_BadInternalCall();
335 return -1;
336 }
337
338 /* Resizing unicode_empty and single character objects is not
339 possible since these are being shared. We simply return a fresh
340 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000341 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 (v == unicode_empty || v->length == 1)) {
343 PyUnicodeObject *w = _PyUnicode_New(length);
344 if (w == NULL)
345 return -1;
346 Py_UNICODE_COPY(w->str, v->str,
347 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000348 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000349 *unicode = (PyObject *)w;
350 return 0;
351 }
352
353 /* Note that we don't have to modify *unicode for unshared Unicode
354 objects, since we can modify them in-place. */
355 return unicode_resize(v, length);
356}
357
358/* Internal API for use in unicodeobject.c only ! */
359#define _PyUnicode_Resize(unicodevar, length) \
360 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
361
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000363 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
365 PyUnicodeObject *unicode;
366
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000367 /* If the Unicode data is known at construction time, we can apply
368 some optimizations which share commonly used objects. */
369 if (u != NULL) {
370
371 /* Optimization for empty strings */
372 if (size == 0 && unicode_empty != NULL) {
373 Py_INCREF(unicode_empty);
374 return (PyObject *)unicode_empty;
375 }
376
377 /* Single character Unicode objects in the Latin-1 range are
378 shared when using this constructor */
379 if (size == 1 && *u < 256) {
380 unicode = unicode_latin1[*u];
381 if (!unicode) {
382 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000383 if (!unicode)
384 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000385 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000386 unicode_latin1[*u] = unicode;
387 }
388 Py_INCREF(unicode);
389 return (PyObject *)unicode;
390 }
391 }
Tim Petersced69f82003-09-16 20:30:58 +0000392
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 unicode = _PyUnicode_New(size);
394 if (!unicode)
395 return NULL;
396
397 /* Copy the Unicode data into the new object */
398 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000399 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400
401 return (PyObject *)unicode;
402}
403
404#ifdef HAVE_WCHAR_H
405
406PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000407 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408{
409 PyUnicodeObject *unicode;
410
411 if (w == NULL) {
412 PyErr_BadInternalCall();
413 return NULL;
414 }
415
416 unicode = _PyUnicode_New(size);
417 if (!unicode)
418 return NULL;
419
420 /* Copy the wchar_t data into the new object */
421#ifdef HAVE_USABLE_WCHAR_T
422 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000423#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000424 {
425 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000426 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000428 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429 *u++ = *w++;
430 }
431#endif
432
433 return (PyObject *)unicode;
434}
435
Martin v. Löwis18e16552006-02-15 17:27:45 +0000436Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
437 wchar_t *w,
438 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000439{
440 if (unicode == NULL) {
441 PyErr_BadInternalCall();
442 return -1;
443 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000444
445 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000447 size = PyUnicode_GET_SIZE(unicode) + 1;
448
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449#ifdef HAVE_USABLE_WCHAR_T
450 memcpy(w, unicode->str, size * sizeof(wchar_t));
451#else
452 {
453 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000454 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000456 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457 *w++ = *u++;
458 }
459#endif
460
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000461 if (size > PyUnicode_GET_SIZE(unicode))
462 return PyUnicode_GET_SIZE(unicode);
463 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464 return size;
465}
466
467#endif
468
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000469PyObject *PyUnicode_FromOrdinal(int ordinal)
470{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000471 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000472
473#ifdef Py_UNICODE_WIDE
474 if (ordinal < 0 || ordinal > 0x10ffff) {
475 PyErr_SetString(PyExc_ValueError,
476 "unichr() arg not in range(0x110000) "
477 "(wide Python build)");
478 return NULL;
479 }
480#else
481 if (ordinal < 0 || ordinal > 0xffff) {
482 PyErr_SetString(PyExc_ValueError,
483 "unichr() arg not in range(0x10000) "
484 "(narrow Python build)");
485 return NULL;
486 }
487#endif
488
Hye-Shik Chang40574832004-04-06 07:24:51 +0000489 s[0] = (Py_UNICODE)ordinal;
490 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000491}
492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493PyObject *PyUnicode_FromObject(register PyObject *obj)
494{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000495 /* XXX Perhaps we should make this API an alias of
496 PyObject_Unicode() instead ?! */
497 if (PyUnicode_CheckExact(obj)) {
498 Py_INCREF(obj);
499 return obj;
500 }
501 if (PyUnicode_Check(obj)) {
502 /* For a Unicode subtype that's not a Unicode object,
503 return a true Unicode object with the same data. */
504 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
505 PyUnicode_GET_SIZE(obj));
506 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000507 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
508}
509
510PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
511 const char *encoding,
512 const char *errors)
513{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000514 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000515 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000517
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518 if (obj == NULL) {
519 PyErr_BadInternalCall();
520 return NULL;
521 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000522
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000523#if 0
524 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000525 that no encodings is given and then redirect to
526 PyObject_Unicode() which then applies the additional logic for
527 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000528
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000529 NOTE: This API should really only be used for object which
530 represent *encoded* Unicode !
531
532 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000533 if (PyUnicode_Check(obj)) {
534 if (encoding) {
535 PyErr_SetString(PyExc_TypeError,
536 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000538 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000539 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000540 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000541#else
542 if (PyUnicode_Check(obj)) {
543 PyErr_SetString(PyExc_TypeError,
544 "decoding Unicode is not supported");
545 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000546 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000547#endif
548
549 /* Coerce object */
550 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000551 s = PyString_AS_STRING(obj);
552 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000553 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000554 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
555 /* Overwrite the error message with something more useful in
556 case of a TypeError. */
557 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000559 "coercing to Unicode: need string or buffer, "
560 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000561 obj->ob_type->tp_name);
562 goto onError;
563 }
Tim Petersced69f82003-09-16 20:30:58 +0000564
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000565 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 if (len == 0) {
567 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000568 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000569 }
Tim Petersced69f82003-09-16 20:30:58 +0000570 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000571 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000572
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000573 return v;
574
575 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577}
578
579PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000580 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 const char *encoding,
582 const char *errors)
583{
584 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000585
586 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000587 encoding = PyUnicode_GetDefaultEncoding();
588
589 /* Shortcuts for common default encodings */
590 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000592 else if (strcmp(encoding, "latin-1") == 0)
593 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000594#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
595 else if (strcmp(encoding, "mbcs") == 0)
596 return PyUnicode_DecodeMBCS(s, size, errors);
597#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000598 else if (strcmp(encoding, "ascii") == 0)
599 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600
601 /* Decode via the codec registry */
602 buffer = PyBuffer_FromMemory((void *)s, size);
603 if (buffer == NULL)
604 goto onError;
605 unicode = PyCodec_Decode(buffer, encoding, errors);
606 if (unicode == NULL)
607 goto onError;
608 if (!PyUnicode_Check(unicode)) {
609 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000610 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000611 unicode->ob_type->tp_name);
612 Py_DECREF(unicode);
613 goto onError;
614 }
615 Py_DECREF(buffer);
616 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000617
Guido van Rossumd57fd912000-03-10 22:53:23 +0000618 onError:
619 Py_XDECREF(buffer);
620 return NULL;
621}
622
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000623PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
624 const char *encoding,
625 const char *errors)
626{
627 PyObject *v;
628
629 if (!PyUnicode_Check(unicode)) {
630 PyErr_BadArgument();
631 goto onError;
632 }
633
634 if (encoding == NULL)
635 encoding = PyUnicode_GetDefaultEncoding();
636
637 /* Decode via the codec registry */
638 v = PyCodec_Decode(unicode, encoding, errors);
639 if (v == NULL)
640 goto onError;
641 return v;
642
643 onError:
644 return NULL;
645}
646
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000648 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649 const char *encoding,
650 const char *errors)
651{
652 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000653
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 unicode = PyUnicode_FromUnicode(s, size);
655 if (unicode == NULL)
656 return NULL;
657 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
658 Py_DECREF(unicode);
659 return v;
660}
661
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000662PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
663 const char *encoding,
664 const char *errors)
665{
666 PyObject *v;
667
668 if (!PyUnicode_Check(unicode)) {
669 PyErr_BadArgument();
670 goto onError;
671 }
672
673 if (encoding == NULL)
674 encoding = PyUnicode_GetDefaultEncoding();
675
676 /* Encode via the codec registry */
677 v = PyCodec_Encode(unicode, encoding, errors);
678 if (v == NULL)
679 goto onError;
680 return v;
681
682 onError:
683 return NULL;
684}
685
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
687 const char *encoding,
688 const char *errors)
689{
690 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000691
Guido van Rossumd57fd912000-03-10 22:53:23 +0000692 if (!PyUnicode_Check(unicode)) {
693 PyErr_BadArgument();
694 goto onError;
695 }
Fred Drakee4315f52000-05-09 19:53:39 +0000696
Tim Petersced69f82003-09-16 20:30:58 +0000697 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000698 encoding = PyUnicode_GetDefaultEncoding();
699
700 /* Shortcuts for common default encodings */
701 if (errors == NULL) {
702 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000703 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000704 else if (strcmp(encoding, "latin-1") == 0)
705 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000706#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
707 else if (strcmp(encoding, "mbcs") == 0)
708 return PyUnicode_AsMBCSString(unicode);
709#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000710 else if (strcmp(encoding, "ascii") == 0)
711 return PyUnicode_AsASCIIString(unicode);
712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000713
714 /* Encode via the codec registry */
715 v = PyCodec_Encode(unicode, encoding, errors);
716 if (v == NULL)
717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 if (!PyString_Check(v)) {
719 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000720 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000721 v->ob_type->tp_name);
722 Py_DECREF(v);
723 goto onError;
724 }
725 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000726
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 onError:
728 return NULL;
729}
730
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000731PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
732 const char *errors)
733{
734 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
735
736 if (v)
737 return v;
738 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
739 if (v && errors == NULL)
740 ((PyUnicodeObject *)unicode)->defenc = v;
741 return v;
742}
743
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
745{
746 if (!PyUnicode_Check(unicode)) {
747 PyErr_BadArgument();
748 goto onError;
749 }
750 return PyUnicode_AS_UNICODE(unicode);
751
752 onError:
753 return NULL;
754}
755
Martin v. Löwis18e16552006-02-15 17:27:45 +0000756Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757{
758 if (!PyUnicode_Check(unicode)) {
759 PyErr_BadArgument();
760 goto onError;
761 }
762 return PyUnicode_GET_SIZE(unicode);
763
764 onError:
765 return -1;
766}
767
Thomas Wouters78890102000-07-22 19:25:51 +0000768const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000769{
770 return unicode_default_encoding;
771}
772
773int PyUnicode_SetDefaultEncoding(const char *encoding)
774{
775 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000776
Fred Drakee4315f52000-05-09 19:53:39 +0000777 /* Make sure the encoding is valid. As side effect, this also
778 loads the encoding into the codec registry cache. */
779 v = _PyCodec_Lookup(encoding);
780 if (v == NULL)
781 goto onError;
782 Py_DECREF(v);
783 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000784 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000785 sizeof(unicode_default_encoding));
786 return 0;
787
788 onError:
789 return -1;
790}
791
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000792/* error handling callback helper:
793 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000794 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795 and adjust various state variables.
796 return 0 on success, -1 on error
797*/
798
799static
800int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
801 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000802 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
803 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000804{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000806
807 PyObject *restuple = NULL;
808 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
810 Py_ssize_t requiredsize;
811 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000812 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000813 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000814 int res = -1;
815
816 if (*errorHandler == NULL) {
817 *errorHandler = PyCodec_LookupError(errors);
818 if (*errorHandler == NULL)
819 goto onError;
820 }
821
822 if (*exceptionObject == NULL) {
823 *exceptionObject = PyUnicodeDecodeError_Create(
824 encoding, input, insize, *startinpos, *endinpos, reason);
825 if (*exceptionObject == NULL)
826 goto onError;
827 }
828 else {
829 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
830 goto onError;
831 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
832 goto onError;
833 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
834 goto onError;
835 }
836
837 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
838 if (restuple == NULL)
839 goto onError;
840 if (!PyTuple_Check(restuple)) {
841 PyErr_Format(PyExc_TypeError, &argparse[4]);
842 goto onError;
843 }
844 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
845 goto onError;
846 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000847 newpos = insize+newpos;
848 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000849 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000850 goto onError;
851 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852
853 /* need more space? (at least enough for what we
854 have+the replacement+the rest of the string (starting
855 at the new input position), so we won't have to check space
856 when there are no errors in the rest of the string) */
857 repptr = PyUnicode_AS_UNICODE(repunicode);
858 repsize = PyUnicode_GET_SIZE(repunicode);
859 requiredsize = *outpos + repsize + insize-newpos;
860 if (requiredsize > outsize) {
861 if (requiredsize<2*outsize)
862 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000863 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000864 goto onError;
865 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
866 }
867 *endinpos = newpos;
868 *inptr = input + newpos;
869 Py_UNICODE_COPY(*outptr, repptr, repsize);
870 *outptr += repsize;
871 *outpos += repsize;
872 /* we made it! */
873 res = 0;
874
875 onError:
876 Py_XDECREF(restuple);
877 return res;
878}
879
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000880/* --- UTF-7 Codec -------------------------------------------------------- */
881
882/* see RFC2152 for details */
883
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
902
/* Helper macros for the UTF-7 codec.  All of them evaluate their
   arguments more than once, so only pass side-effect free expressions. */

/* SPECIAL(c, encodeO, encodeWS): true if c cannot be written directly
   and has to go into a base64 shift sequence.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  The range test keeps values
   outside ASCII away from isalnum(), whose behaviour is undefined for
   arguments that do not fit in an unsigned char and locale-dependent
   for 128..255. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128UL && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* UB64(c): value of base64 digit c; only meaningful if B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): emit base64 digits to *out for as long as at
   least six of the `bits` pending bits in `ch` are left; updates `out`
   and `bits`.  Wrapped in do/while(0) so it behaves as one statement. */
#define ENCODE(out, ch, bits)                   \
    do {                                        \
        while (bits >= 6) {                     \
            *out++ = B64(ch >> (bits-6));       \
            bits -= 6;                          \
        }                                       \
    } while (0)

/* DECODE(out, ch, bits, surrogate): emit 16-bit units to *out for as
   long as at least sixteen pending bits are left.  Relies on the
   expansion site providing an `errmsg` variable and a `utf7Error`
   label.  Wrapped in do/while(0) so it behaves as one statement. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while (bits >= 16) {                                            \
            Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
            bits -= 16;                                                 \
            if (surrogate) {                                            \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                surrogate = 0;                                          \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */                          \
                surrogate = 1;                                          \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *out++ = outCh;                                         \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000948 const char *errors)
949{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000950 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000951 Py_ssize_t startinpos;
952 Py_ssize_t endinpos;
953 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 const char *e;
955 PyUnicodeObject *unicode;
956 Py_UNICODE *p;
957 const char *errmsg = "";
958 int inShift = 0;
959 unsigned int bitsleft = 0;
960 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000961 int surrogate = 0;
962 PyObject *errorHandler = NULL;
963 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000964
965 unicode = _PyUnicode_New(size);
966 if (!unicode)
967 return NULL;
968 if (size == 0)
969 return (PyObject *)unicode;
970
971 p = unicode->str;
972 e = s + size;
973
974 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000975 Py_UNICODE ch;
976 restart:
977 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000978
979 if (inShift) {
980 if ((ch == '-') || !B64CHAR(ch)) {
981 inShift = 0;
982 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000983
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000984 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
985 if (bitsleft >= 6) {
986 /* The shift sequence has a partial character in it. If
987 bitsleft < 6 then we could just classify it as padding
988 but that is not the case here */
989
990 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000991 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000992 }
993 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000994 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000995 here so indicate the potential of a misencoded character. */
996
997 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
998 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
999 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001000 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 }
1002
1003 if (ch == '-') {
1004 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001005 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 inShift = 1;
1007 }
1008 } else if (SPECIAL(ch,0,0)) {
1009 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001010 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 } else {
1012 *p++ = ch;
1013 }
1014 } else {
1015 charsleft = (charsleft << 6) | UB64(ch);
1016 bitsleft += 6;
1017 s++;
1018 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1019 }
1020 }
1021 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001022 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001023 s++;
1024 if (s < e && *s == '-') {
1025 s++;
1026 *p++ = '+';
1027 } else
1028 {
1029 inShift = 1;
1030 bitsleft = 0;
1031 }
1032 }
1033 else if (SPECIAL(ch,0,0)) {
1034 errmsg = "unexpected special character";
1035 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001036 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001037 }
1038 else {
1039 *p++ = ch;
1040 s++;
1041 }
1042 continue;
1043 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001044 outpos = p-PyUnicode_AS_UNICODE(unicode);
1045 endinpos = s-starts;
1046 if (unicode_decode_call_errorhandler(
1047 errors, &errorHandler,
1048 "utf7", errmsg,
1049 starts, size, &startinpos, &endinpos, &exc, &s,
1050 (PyObject **)&unicode, &outpos, &p))
1051 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 }
1053
1054 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001055 outpos = p-PyUnicode_AS_UNICODE(unicode);
1056 endinpos = size;
1057 if (unicode_decode_call_errorhandler(
1058 errors, &errorHandler,
1059 "utf7", "unterminated shift sequence",
1060 starts, size, &startinpos, &endinpos, &exc, &s,
1061 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001063 if (s < e)
1064 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001065 }
1066
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001067 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 goto onError;
1069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001070 Py_XDECREF(errorHandler);
1071 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001072 return (PyObject *)unicode;
1073
1074onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001075 Py_XDECREF(errorHandler);
1076 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001077 Py_DECREF(unicode);
1078 return NULL;
1079}
1080
1081
1082PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001083 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001084 int encodeSetO,
1085 int encodeWhiteSpace,
1086 const char *errors)
1087{
1088 PyObject *v;
1089 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001091 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001092 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 unsigned int bitsleft = 0;
1094 unsigned long charsleft = 0;
1095 char * out;
1096 char * start;
1097
1098 if (size == 0)
1099 return PyString_FromStringAndSize(NULL, 0);
1100
1101 v = PyString_FromStringAndSize(NULL, cbAllocated);
1102 if (v == NULL)
1103 return NULL;
1104
1105 start = out = PyString_AS_STRING(v);
1106 for (;i < size; ++i) {
1107 Py_UNICODE ch = s[i];
1108
1109 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001110 if (ch == '+') {
1111 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001112 *out++ = '-';
1113 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1114 charsleft = ch;
1115 bitsleft = 16;
1116 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001117 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001118 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001119 } else {
1120 *out++ = (char) ch;
1121 }
1122 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001123 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1124 *out++ = B64(charsleft << (6-bitsleft));
1125 charsleft = 0;
1126 bitsleft = 0;
1127 /* Characters not in the BASE64 set implicitly unshift the sequence
1128 so no '-' is required, except if the character is itself a '-' */
1129 if (B64CHAR(ch) || ch == '-') {
1130 *out++ = '-';
1131 }
1132 inShift = 0;
1133 *out++ = (char) ch;
1134 } else {
1135 bitsleft += 16;
1136 charsleft = (charsleft << 16) | ch;
1137 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1138
1139 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001140 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001141 or '-' then the shift sequence will be terminated implicitly and we
1142 don't have to insert a '-'. */
1143
1144 if (bitsleft == 0) {
1145 if (i + 1 < size) {
1146 Py_UNICODE ch2 = s[i+1];
1147
1148 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001149
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 } else if (B64CHAR(ch2) || ch2 == '-') {
1151 *out++ = '-';
1152 inShift = 0;
1153 } else {
1154 inShift = 0;
1155 }
1156
1157 }
1158 else {
1159 *out++ = '-';
1160 inShift = 0;
1161 }
1162 }
Tim Petersced69f82003-09-16 20:30:58 +00001163 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001164 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001165 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001166 if (bitsleft) {
1167 *out++= B64(charsleft << (6-bitsleft) );
1168 *out++ = '-';
1169 }
1170
Tim Peters5de98422002-04-27 18:44:32 +00001171 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001172 return v;
1173}
1174
1175#undef SPECIAL
1176#undef B64
1177#undef B64CHAR
1178#undef UB64
1179#undef ENCODE
1180#undef DECODE
1181
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182/* --- UTF-8 Codec -------------------------------------------------------- */
1183
/* Sequence length implied by each possible first byte of a UTF-8
   sequence.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1205
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001207 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 const char *errors)
1209{
Walter Dörwald69652032004-09-07 20:24:22 +00001210 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1211}
1212
1213PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001214 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001215 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001217{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001218 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001220 Py_ssize_t startinpos;
1221 Py_ssize_t endinpos;
1222 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 const char *e;
1224 PyUnicodeObject *unicode;
1225 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001226 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 PyObject *errorHandler = NULL;
1228 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229
1230 /* Note: size will always be longer than the resulting Unicode
1231 character count */
1232 unicode = _PyUnicode_New(size);
1233 if (!unicode)
1234 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 if (size == 0) {
1236 if (consumed)
1237 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240
1241 /* Unpack UTF-8 encoded data */
1242 p = unicode->str;
1243 e = s + size;
1244
1245 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001246 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247
1248 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001249 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 s++;
1251 continue;
1252 }
1253
1254 n = utf8_code_length[ch];
1255
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001256 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001257 if (consumed)
1258 break;
1259 else {
1260 errmsg = "unexpected end of data";
1261 startinpos = s-starts;
1262 endinpos = size;
1263 goto utf8Error;
1264 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266
1267 switch (n) {
1268
1269 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001270 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001271 startinpos = s-starts;
1272 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001273 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274
1275 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001276 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001277 startinpos = s-starts;
1278 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280
1281 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 if ((s[1] & 0xc0) != 0x80) {
1283 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 startinpos = s-starts;
1285 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 goto utf8Error;
1287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001289 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001290 startinpos = s-starts;
1291 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 errmsg = "illegal encoding";
1293 goto utf8Error;
1294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001296 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 break;
1298
1299 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001300 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 (s[2] & 0xc0) != 0x80) {
1302 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001303 startinpos = s-starts;
1304 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001305 goto utf8Error;
1306 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001308 if (ch < 0x0800) {
1309 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001310 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001311
1312 XXX For wide builds (UCS-4) we should probably try
1313 to recombine the surrogates into a single code
1314 unit.
1315 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 startinpos = s-starts;
1318 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001319 goto utf8Error;
1320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001322 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 break;
1324
1325 case 4:
1326 if ((s[1] & 0xc0) != 0x80 ||
1327 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 (s[3] & 0xc0) != 0x80) {
1329 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001330 startinpos = s-starts;
1331 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001332 goto utf8Error;
1333 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001334 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1335 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1336 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001338 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001339 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001340 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001341 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001343 startinpos = s-starts;
1344 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001345 goto utf8Error;
1346 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001347#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001348 *p++ = (Py_UNICODE)ch;
1349#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001350 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001351
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001352 /* translate from 10000..10FFFF to 0..FFFF */
1353 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001354
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001355 /* high surrogate = top 10 bits added to D800 */
1356 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001357
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001358 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001359 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001360#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 break;
1362
1363 default:
1364 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001366 startinpos = s-starts;
1367 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001368 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369 }
1370 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001371 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001372
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001373 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001374 outpos = p-PyUnicode_AS_UNICODE(unicode);
1375 if (unicode_decode_call_errorhandler(
1376 errors, &errorHandler,
1377 "utf8", errmsg,
1378 starts, size, &startinpos, &endinpos, &exc, &s,
1379 (PyObject **)&unicode, &outpos, &p))
1380 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 }
Walter Dörwald69652032004-09-07 20:24:22 +00001382 if (consumed)
1383 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384
1385 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001386 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 goto onError;
1388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001389 Py_XDECREF(errorHandler);
1390 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 return (PyObject *)unicode;
1392
1393onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 Py_XDECREF(errorHandler);
1395 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 Py_DECREF(unicode);
1397 return NULL;
1398}
1399
Tim Peters602f7402002-04-27 18:03:26 +00001400/* Allocation strategy: if the string is short, convert into a stack buffer
1401 and allocate exactly as much space needed at the end. Else allocate the
1402 maximum possible needed (4 result bytes per Unicode character), and return
1403 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001404*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001405PyObject *
1406PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001407 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001408 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409{
Tim Peters602f7402002-04-27 18:03:26 +00001410#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001411
Martin v. Löwis18e16552006-02-15 17:27:45 +00001412 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001413 PyObject *v; /* result string object */
1414 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001416 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001417 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001418
Tim Peters602f7402002-04-27 18:03:26 +00001419 assert(s != NULL);
1420 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421
Tim Peters602f7402002-04-27 18:03:26 +00001422 if (size <= MAX_SHORT_UNICHARS) {
1423 /* Write into the stack buffer; nallocated can't overflow.
1424 * At the end, we'll allocate exactly as much heap space as it
1425 * turns out we need.
1426 */
1427 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1428 v = NULL; /* will allocate after we're done */
1429 p = stackbuf;
1430 }
1431 else {
1432 /* Overallocate on the heap, and give the excess back at the end. */
1433 nallocated = size * 4;
1434 if (nallocated / 4 != size) /* overflow! */
1435 return PyErr_NoMemory();
1436 v = PyString_FromStringAndSize(NULL, nallocated);
1437 if (v == NULL)
1438 return NULL;
1439 p = PyString_AS_STRING(v);
1440 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441
Tim Peters602f7402002-04-27 18:03:26 +00001442 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001443 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001445 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001448
Guido van Rossumd57fd912000-03-10 22:53:23 +00001449 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001450 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001451 *p++ = (char)(0xc0 | (ch >> 6));
1452 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001453 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001454 else {
Tim Peters602f7402002-04-27 18:03:26 +00001455 /* Encode UCS2 Unicode ordinals */
1456 if (ch < 0x10000) {
1457 /* Special case: check for high surrogate */
1458 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1459 Py_UCS4 ch2 = s[i];
1460 /* Check for low surrogate and combine the two to
1461 form a UCS4 value */
1462 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001463 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001464 i++;
1465 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001466 }
Tim Peters602f7402002-04-27 18:03:26 +00001467 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001468 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001469 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001470 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1471 *p++ = (char)(0x80 | (ch & 0x3f));
1472 continue;
1473 }
1474encodeUCS4:
1475 /* Encode UCS4 Unicode ordinals */
1476 *p++ = (char)(0xf0 | (ch >> 18));
1477 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1478 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1479 *p++ = (char)(0x80 | (ch & 0x3f));
1480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001482
Tim Peters602f7402002-04-27 18:03:26 +00001483 if (v == NULL) {
1484 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001485 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001486 assert(nneeded <= nallocated);
1487 v = PyString_FromStringAndSize(stackbuf, nneeded);
1488 }
1489 else {
1490 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001491 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001492 assert(nneeded <= nallocated);
1493 _PyString_Resize(&v, nneeded);
1494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001496
Tim Peters602f7402002-04-27 18:03:26 +00001497#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498}
1499
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1501{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502 if (!PyUnicode_Check(unicode)) {
1503 PyErr_BadArgument();
1504 return NULL;
1505 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001506 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1507 PyUnicode_GET_SIZE(unicode),
1508 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509}
1510
1511/* --- UTF-16 Codec ------------------------------------------------------- */
1512
Tim Peters772747b2001-08-09 22:21:55 +00001513PyObject *
1514PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001515 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001516 const char *errors,
1517 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518{
Walter Dörwald69652032004-09-07 20:24:22 +00001519 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1520}
1521
1522PyObject *
1523PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001524 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001525 const char *errors,
1526 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001527 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001530 Py_ssize_t startinpos;
1531 Py_ssize_t endinpos;
1532 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 PyUnicodeObject *unicode;
1534 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001535 const unsigned char *q, *e;
1536 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001537 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001538 /* Offsets from q for retrieving byte pairs in the right order. */
1539#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1540 int ihi = 1, ilo = 0;
1541#else
1542 int ihi = 0, ilo = 1;
1543#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 PyObject *errorHandler = NULL;
1545 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546
1547 /* Note: size will always be longer than the resulting Unicode
1548 character count */
1549 unicode = _PyUnicode_New(size);
1550 if (!unicode)
1551 return NULL;
1552 if (size == 0)
1553 return (PyObject *)unicode;
1554
1555 /* Unpack UTF-16 encoded data */
1556 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001557 q = (unsigned char *)s;
1558 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559
1560 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001561 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001563 /* Check for BOM marks (U+FEFF) in the input and adjust current
1564 byte order setting accordingly. In native mode, the leading BOM
1565 mark is skipped, in all other modes, it is copied to the output
1566 stream as-is (giving a ZWNBSP character). */
1567 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001568 if (size >= 2) {
1569 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001570#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001571 if (bom == 0xFEFF) {
1572 q += 2;
1573 bo = -1;
1574 }
1575 else if (bom == 0xFFFE) {
1576 q += 2;
1577 bo = 1;
1578 }
Tim Petersced69f82003-09-16 20:30:58 +00001579#else
Walter Dörwald69652032004-09-07 20:24:22 +00001580 if (bom == 0xFEFF) {
1581 q += 2;
1582 bo = 1;
1583 }
1584 else if (bom == 0xFFFE) {
1585 q += 2;
1586 bo = -1;
1587 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001588#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001589 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591
Tim Peters772747b2001-08-09 22:21:55 +00001592 if (bo == -1) {
1593 /* force LE */
1594 ihi = 1;
1595 ilo = 0;
1596 }
1597 else if (bo == 1) {
1598 /* force BE */
1599 ihi = 0;
1600 ilo = 1;
1601 }
1602
1603 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001605 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001607 if (consumed)
1608 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001609 errmsg = "truncated data";
1610 startinpos = ((const char *)q)-starts;
1611 endinpos = ((const char *)e)-starts;
1612 goto utf16Error;
1613 /* The remaining input chars are ignored if the callback
1614 chooses to skip the input */
1615 }
1616 ch = (q[ihi] << 8) | q[ilo];
1617
Tim Peters772747b2001-08-09 22:21:55 +00001618 q += 2;
1619
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 if (ch < 0xD800 || ch > 0xDFFF) {
1621 *p++ = ch;
1622 continue;
1623 }
1624
1625 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 if (q >= e) {
1627 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 startinpos = (((const char *)q)-2)-starts;
1629 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001630 goto utf16Error;
1631 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001632 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001633 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1634 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001635 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001636#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001637 *p++ = ch;
1638 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639#else
1640 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001641#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001642 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001643 }
1644 else {
1645 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 startinpos = (((const char *)q)-4)-starts;
1647 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001648 goto utf16Error;
1649 }
1650
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001652 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001653 startinpos = (((const char *)q)-2)-starts;
1654 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001655 /* Fall through to report the error */
1656
1657 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 outpos = p-PyUnicode_AS_UNICODE(unicode);
1659 if (unicode_decode_call_errorhandler(
1660 errors, &errorHandler,
1661 "utf16", errmsg,
1662 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1663 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001664 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 }
1666
1667 if (byteorder)
1668 *byteorder = bo;
1669
Walter Dörwald69652032004-09-07 20:24:22 +00001670 if (consumed)
1671 *consumed = (const char *)q-starts;
1672
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001674 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 goto onError;
1676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001677 Py_XDECREF(errorHandler);
1678 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 return (PyObject *)unicode;
1680
1681onError:
1682 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001683 Py_XDECREF(errorHandler);
1684 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 return NULL;
1686}
1687
Tim Peters772747b2001-08-09 22:21:55 +00001688PyObject *
1689PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001690 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001691 const char *errors,
1692 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693{
1694 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001695 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001696#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001697 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001698#else
1699 const int pairs = 0;
1700#endif
Tim Peters772747b2001-08-09 22:21:55 +00001701 /* Offsets from p for storing byte pairs in the right order. */
1702#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1703 int ihi = 1, ilo = 0;
1704#else
1705 int ihi = 0, ilo = 1;
1706#endif
1707
1708#define STORECHAR(CH) \
1709 do { \
1710 p[ihi] = ((CH) >> 8) & 0xff; \
1711 p[ilo] = (CH) & 0xff; \
1712 p += 2; \
1713 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001716 for (i = pairs = 0; i < size; i++)
1717 if (s[i] >= 0x10000)
1718 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001719#endif
Tim Petersced69f82003-09-16 20:30:58 +00001720 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001721 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (v == NULL)
1723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
Tim Peters772747b2001-08-09 22:21:55 +00001725 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001727 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001728 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001729 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001730
1731 if (byteorder == -1) {
1732 /* force LE */
1733 ihi = 1;
1734 ilo = 0;
1735 }
1736 else if (byteorder == 1) {
1737 /* force BE */
1738 ihi = 0;
1739 ilo = 1;
1740 }
1741
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 while (size-- > 0) {
1743 Py_UNICODE ch = *s++;
1744 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001745#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001746 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001747 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1748 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001750#endif
Tim Peters772747b2001-08-09 22:21:55 +00001751 STORECHAR(ch);
1752 if (ch2)
1753 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001756#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757}
1758
1759PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1760{
1761 if (!PyUnicode_Check(unicode)) {
1762 PyErr_BadArgument();
1763 return NULL;
1764 }
1765 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1766 PyUnicode_GET_SIZE(unicode),
1767 NULL,
1768 0);
1769}
1770
1771/* --- Unicode Escape Codec ----------------------------------------------- */
1772
/* Cached pointer to the name-lookup C API exported by the unicodedata
   module as its "ucnhash_CAPI" attribute.  It starts out NULL and is
   filled in by PyUnicode_DecodeUnicodeEscape() the first time a \N
   escape has to be resolved. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001773static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001774
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 const char *errors)
1778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001780 Py_ssize_t startinpos;
1781 Py_ssize_t endinpos;
1782 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001783 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787 char* message;
1788 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001789 PyObject *errorHandler = NULL;
1790 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001791
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 /* Escaped strings will always be longer than the resulting
1793 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 length after conversion to the true value.
1795 (but if the error callback returns a long replacement string
1796 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 v = _PyUnicode_New(size);
1798 if (v == NULL)
1799 goto onError;
1800 if (size == 0)
1801 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 while (s < end) {
1807 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001808 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810
1811 /* Non-escape characters are interpreted as Unicode ordinals */
1812 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001813 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 continue;
1815 }
1816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001817 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818 /* \ - Escapes */
1819 s++;
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001820 c = *s++;
1821 if (s > end)
1822 c = '\0'; /* Invalid after \ */
1823 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824
1825 /* \x escapes */
1826 case '\n': break;
1827 case '\\': *p++ = '\\'; break;
1828 case '\'': *p++ = '\''; break;
1829 case '\"': *p++ = '\"'; break;
1830 case 'b': *p++ = '\b'; break;
1831 case 'f': *p++ = '\014'; break; /* FF */
1832 case 't': *p++ = '\t'; break;
1833 case 'n': *p++ = '\n'; break;
1834 case 'r': *p++ = '\r'; break;
1835 case 'v': *p++ = '\013'; break; /* VT */
1836 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1837
1838 /* \OOO (octal) escapes */
1839 case '0': case '1': case '2': case '3':
1840 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001841 x = s[-1] - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001842 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001843 x = (x<<3) + *s++ - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001844 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001845 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001847 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 break;
1849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* hex escapes */
1851 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001853 digits = 2;
1854 message = "truncated \\xXX escape";
1855 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856
Fredrik Lundhccc74732001-02-18 22:13:49 +00001857 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001859 digits = 4;
1860 message = "truncated \\uXXXX escape";
1861 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001864 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001865 digits = 8;
1866 message = "truncated \\UXXXXXXXX escape";
1867 hexescape:
1868 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 outpos = p-PyUnicode_AS_UNICODE(v);
1870 if (s+digits>end) {
1871 endinpos = size;
1872 if (unicode_decode_call_errorhandler(
1873 errors, &errorHandler,
1874 "unicodeescape", "end of string in escape sequence",
1875 starts, size, &startinpos, &endinpos, &exc, &s,
1876 (PyObject **)&v, &outpos, &p))
1877 goto onError;
1878 goto nextByte;
1879 }
1880 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001881 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 endinpos = (s+i+1)-starts;
1884 if (unicode_decode_call_errorhandler(
1885 errors, &errorHandler,
1886 "unicodeescape", message,
1887 starts, size, &startinpos, &endinpos, &exc, &s,
1888 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001889 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001891 }
1892 chr = (chr<<4) & ~0xF;
1893 if (c >= '0' && c <= '9')
1894 chr += c - '0';
1895 else if (c >= 'a' && c <= 'f')
1896 chr += 10 + c - 'a';
1897 else
1898 chr += 10 + c - 'A';
1899 }
1900 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001901 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 /* _decoding_error will have already written into the
1903 target buffer. */
1904 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001905 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001906 /* when we get here, chr is a 32-bit unicode character */
1907 if (chr <= 0xffff)
1908 /* UCS-2 character */
1909 *p++ = (Py_UNICODE) chr;
1910 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001911 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001912 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001913#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001914 *p++ = chr;
1915#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001916 chr -= 0x10000L;
1917 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001918 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001919#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001920 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001921 endinpos = s-starts;
1922 outpos = p-PyUnicode_AS_UNICODE(v);
1923 if (unicode_decode_call_errorhandler(
1924 errors, &errorHandler,
1925 "unicodeescape", "illegal Unicode character",
1926 starts, size, &startinpos, &endinpos, &exc, &s,
1927 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001928 goto onError;
1929 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001930 break;
1931
1932 /* \N{name} */
1933 case 'N':
1934 message = "malformed \\N character escape";
1935 if (ucnhash_CAPI == NULL) {
1936 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001937 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001938 m = PyImport_ImportModule("unicodedata");
1939 if (m == NULL)
1940 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001941 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001943 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001944 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001945 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001946 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001947 if (ucnhash_CAPI == NULL)
1948 goto ucnhashError;
1949 }
1950 if (*s == '{') {
1951 const char *start = s+1;
1952 /* look for the closing brace */
1953 while (*s != '}' && s < end)
1954 s++;
1955 if (s > start && s < end && *s == '}') {
1956 /* found a name. look it up in the unicode database */
1957 message = "unknown Unicode character name";
1958 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001959 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001960 goto store;
1961 }
1962 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001963 endinpos = s-starts;
1964 outpos = p-PyUnicode_AS_UNICODE(v);
1965 if (unicode_decode_call_errorhandler(
1966 errors, &errorHandler,
1967 "unicodeescape", message,
1968 starts, size, &startinpos, &endinpos, &exc, &s,
1969 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001970 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001971 break;
1972
1973 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001974 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001975 message = "\\ at end of string";
1976 s--;
1977 endinpos = s-starts;
1978 outpos = p-PyUnicode_AS_UNICODE(v);
1979 if (unicode_decode_call_errorhandler(
1980 errors, &errorHandler,
1981 "unicodeescape", message,
1982 starts, size, &startinpos, &endinpos, &exc, &s,
1983 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001984 goto onError;
1985 }
1986 else {
1987 *p++ = '\\';
1988 *p++ = (unsigned char)s[-1];
1989 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001990 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001992 nextByte:
1993 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001995 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001996 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001997 Py_XDECREF(errorHandler);
1998 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002000
Fredrik Lundhccc74732001-02-18 22:13:49 +00002001ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002002 PyErr_SetString(
2003 PyExc_UnicodeError,
2004 "\\N escapes not supported (can't load unicodedata module)"
2005 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002006 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002007 Py_XDECREF(errorHandler);
2008 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002009 return NULL;
2010
Fredrik Lundhccc74732001-02-18 22:13:49 +00002011onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 Py_XDECREF(errorHandler);
2014 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 return NULL;
2016}
2017
2018/* Return a Unicode-Escape string version of the Unicode object.
2019
2020 If quotes is true, the string is enclosed in u"" or u'' quotes as
2021 appropriate.
2022
2023*/
2024
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002025Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002026 Py_ssize_t size,
2027 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002028{
2029 /* like wcschr, but doesn't stop at NULL characters */
2030
2031 while (size-- > 0) {
2032 if (*s == ch)
2033 return s;
2034 s++;
2035 }
2036
2037 return NULL;
2038}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040static
2041PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002042 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 int quotes)
2044{
2045 PyObject *repr;
2046 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002048 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002050 /* Initial allocation is based on the longest-possible unichr
2051 escape.
2052
2053 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2054 unichr, so in this case it's the longest unichr escape. In
2055 narrow (UTF-16) builds this is five chars per source unichr
2056 since there are two unichrs in the surrogate pair, so in narrow
2057 (UTF-16) builds it's not the longest unichr escape.
2058
2059 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2060 so in the narrow (UTF-16) build case it's the longest unichr
2061 escape.
2062 */
2063
2064 repr = PyString_FromStringAndSize(NULL,
2065 2
2066#ifdef Py_UNICODE_WIDE
2067 + 10*size
2068#else
2069 + 6*size
2070#endif
2071 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 if (repr == NULL)
2073 return NULL;
2074
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002075 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076
2077 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002079 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 !findchar(s, size, '"')) ? '"' : '\'';
2081 }
2082 while (size-- > 0) {
2083 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002084
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002085 /* Escape quotes and backslashes */
2086 if ((quotes &&
2087 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088 *p++ = '\\';
2089 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002090 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002091 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002092
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002093#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002094 /* Map 21-bit characters to '\U00xxxxxx' */
2095 else if (ch >= 0x10000) {
2096 *p++ = '\\';
2097 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002098 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2099 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2100 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2101 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2102 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2103 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2104 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002105 *p++ = hexdigit[ch & 0x0000000F];
2106 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002107 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002108#else
2109 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002110 else if (ch >= 0xD800 && ch < 0xDC00) {
2111 Py_UNICODE ch2;
2112 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002113
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002114 ch2 = *s++;
2115 size--;
2116 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2117 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2118 *p++ = '\\';
2119 *p++ = 'U';
2120 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2121 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2122 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2123 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2124 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2125 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2126 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2127 *p++ = hexdigit[ucs & 0x0000000F];
2128 continue;
2129 }
2130 /* Fall through: isolated surrogates are copied as-is */
2131 s--;
2132 size++;
2133 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002134#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002135
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002137 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 *p++ = '\\';
2139 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002140 *p++ = hexdigit[(ch >> 12) & 0x000F];
2141 *p++ = hexdigit[(ch >> 8) & 0x000F];
2142 *p++ = hexdigit[(ch >> 4) & 0x000F];
2143 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002145
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002146 /* Map special whitespace to '\t', \n', '\r' */
2147 else if (ch == '\t') {
2148 *p++ = '\\';
2149 *p++ = 't';
2150 }
2151 else if (ch == '\n') {
2152 *p++ = '\\';
2153 *p++ = 'n';
2154 }
2155 else if (ch == '\r') {
2156 *p++ = '\\';
2157 *p++ = 'r';
2158 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002159
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002160 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002161 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002163 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002164 *p++ = hexdigit[(ch >> 4) & 0x000F];
2165 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002166 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002167
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 /* Copy everything else as-is */
2169 else
2170 *p++ = (char) ch;
2171 }
2172 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002173 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174
2175 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002176 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 return repr;
2178}
2179
2180PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002181 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182{
2183 return unicodeescape_string(s, size, 0);
2184}
2185
2186PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2187{
2188 if (!PyUnicode_Check(unicode)) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode));
2194}
2195
2196/* --- Raw Unicode Escape Codec ------------------------------------------- */
2197
2198PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002199 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 const char *errors)
2201{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002203 Py_ssize_t startinpos;
2204 Py_ssize_t endinpos;
2205 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 const char *end;
2209 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002210 PyObject *errorHandler = NULL;
2211 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002212
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 /* Escaped strings will always be longer than the resulting
2214 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 length after conversion to the true value. (But decoding error
2216 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 v = _PyUnicode_New(size);
2218 if (v == NULL)
2219 goto onError;
2220 if (size == 0)
2221 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002222 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 end = s + size;
2224 while (s < end) {
2225 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002226 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002228 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229
2230 /* Non-escape characters are interpreted as Unicode ordinals */
2231 if (*s != '\\') {
2232 *p++ = (unsigned char)*s++;
2233 continue;
2234 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236
2237 /* \u-escapes are only interpreted iff the number of leading
2238 backslashes if odd */
2239 bs = s;
2240 for (;s < end;) {
2241 if (*s != '\\')
2242 break;
2243 *p++ = (unsigned char)*s++;
2244 }
2245 if (((s - bs) & 1) == 0 ||
2246 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002247 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 continue;
2249 }
2250 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002251 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 s++;
2253
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002254 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 endinpos = s-starts;
2260 if (unicode_decode_call_errorhandler(
2261 errors, &errorHandler,
2262 "rawunicodeescape", "truncated \\uXXXX",
2263 starts, size, &startinpos, &endinpos, &exc, &s,
2264 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 }
2268 x = (x<<4) & ~0xF;
2269 if (c >= '0' && c <= '9')
2270 x += c - '0';
2271 else if (c >= 'a' && c <= 'f')
2272 x += 10 + c - 'a';
2273 else
2274 x += 10 + c - 'A';
2275 }
Amaury Forgeot d'Arca79e0502008-03-24 21:16:28 +00002276 if (x <= 0xffff)
2277 /* UCS-2 character */
2278 *p++ = (Py_UNICODE) x;
2279 else if (x <= 0x10ffff) {
2280 /* UCS-4 character. Either store directly, or as
2281 surrogate pair. */
2282#ifdef Py_UNICODE_WIDE
2283 *p++ = (Py_UNICODE) x;
2284#else
2285 x -= 0x10000L;
2286 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
2287 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
2288#endif
2289 } else {
2290 endinpos = s-starts;
2291 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002292 if (unicode_decode_call_errorhandler(
2293 errors, &errorHandler,
2294 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2295 starts, size, &startinpos, &endinpos, &exc, &s,
2296 (PyObject **)&v, &outpos, &p))
2297 goto onError;
2298 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002299 nextByte:
2300 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002302 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002303 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002304 Py_XDECREF(errorHandler);
2305 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002307
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 onError:
2309 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002310 Py_XDECREF(errorHandler);
2311 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 return NULL;
2313}
2314
2315PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002316 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002317{
2318 PyObject *repr;
2319 char *p;
2320 char *q;
2321
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002322 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002324#ifdef Py_UNICODE_WIDE
2325 repr = PyString_FromStringAndSize(NULL, 10 * size);
2326#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002328#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329 if (repr == NULL)
2330 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002331 if (size == 0)
2332 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333
2334 p = q = PyString_AS_STRING(repr);
2335 while (size-- > 0) {
2336 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002337#ifdef Py_UNICODE_WIDE
2338 /* Map 32-bit characters to '\Uxxxxxxxx' */
2339 if (ch >= 0x10000) {
2340 *p++ = '\\';
2341 *p++ = 'U';
2342 *p++ = hexdigit[(ch >> 28) & 0xf];
2343 *p++ = hexdigit[(ch >> 24) & 0xf];
2344 *p++ = hexdigit[(ch >> 20) & 0xf];
2345 *p++ = hexdigit[(ch >> 16) & 0xf];
2346 *p++ = hexdigit[(ch >> 12) & 0xf];
2347 *p++ = hexdigit[(ch >> 8) & 0xf];
2348 *p++ = hexdigit[(ch >> 4) & 0xf];
2349 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002350 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002351 else
Amaury Forgeot d'Arca79e0502008-03-24 21:16:28 +00002352#else
2353 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2354 if (ch >= 0xD800 && ch < 0xDC00) {
2355 Py_UNICODE ch2;
2356 Py_UCS4 ucs;
2357
2358 ch2 = *s++;
2359 size--;
2360 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2361 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2362 *p++ = '\\';
2363 *p++ = 'U';
2364 *p++ = hexdigit[(ucs >> 28) & 0xf];
2365 *p++ = hexdigit[(ucs >> 24) & 0xf];
2366 *p++ = hexdigit[(ucs >> 20) & 0xf];
2367 *p++ = hexdigit[(ucs >> 16) & 0xf];
2368 *p++ = hexdigit[(ucs >> 12) & 0xf];
2369 *p++ = hexdigit[(ucs >> 8) & 0xf];
2370 *p++ = hexdigit[(ucs >> 4) & 0xf];
2371 *p++ = hexdigit[ucs & 0xf];
2372 continue;
2373 }
2374 /* Fall through: isolated surrogates are copied as-is */
2375 s--;
2376 size++;
2377 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002378#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379 /* Map 16-bit characters to '\uxxxx' */
2380 if (ch >= 256) {
2381 *p++ = '\\';
2382 *p++ = 'u';
2383 *p++ = hexdigit[(ch >> 12) & 0xf];
2384 *p++ = hexdigit[(ch >> 8) & 0xf];
2385 *p++ = hexdigit[(ch >> 4) & 0xf];
2386 *p++ = hexdigit[ch & 15];
2387 }
2388 /* Copy everything else as-is */
2389 else
2390 *p++ = (char) ch;
2391 }
2392 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002393 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 return repr;
2395}
2396
2397PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2398{
2399 if (!PyUnicode_Check(unicode)) {
2400 PyErr_BadArgument();
2401 return NULL;
2402 }
2403 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2404 PyUnicode_GET_SIZE(unicode));
2405}
2406
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002407/* --- Unicode Internal Codec ------------------------------------------- */
2408
2409PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002410 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002411 const char *errors)
2412{
2413 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002414 Py_ssize_t startinpos;
2415 Py_ssize_t endinpos;
2416 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002417 PyUnicodeObject *v;
2418 Py_UNICODE *p;
2419 const char *end;
2420 const char *reason;
2421 PyObject *errorHandler = NULL;
2422 PyObject *exc = NULL;
2423
Neal Norwitzd43069c2006-01-08 01:12:10 +00002424#ifdef Py_UNICODE_WIDE
2425 Py_UNICODE unimax = PyUnicode_GetMax();
2426#endif
2427
Armin Rigo4b63c212006-10-04 11:44:06 +00002428 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002429 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2430 if (v == NULL)
2431 goto onError;
2432 if (PyUnicode_GetSize((PyObject *)v) == 0)
2433 return (PyObject *)v;
2434 p = PyUnicode_AS_UNICODE(v);
2435 end = s + size;
2436
2437 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002438 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002439 /* We have to sanity check the raw data, otherwise doom looms for
2440 some malformed UCS-4 data. */
2441 if (
2442 #ifdef Py_UNICODE_WIDE
2443 *p > unimax || *p < 0 ||
2444 #endif
2445 end-s < Py_UNICODE_SIZE
2446 )
2447 {
2448 startinpos = s - starts;
2449 if (end-s < Py_UNICODE_SIZE) {
2450 endinpos = end-starts;
2451 reason = "truncated input";
2452 }
2453 else {
2454 endinpos = s - starts + Py_UNICODE_SIZE;
2455 reason = "illegal code point (> 0x10FFFF)";
2456 }
2457 outpos = p - PyUnicode_AS_UNICODE(v);
2458 if (unicode_decode_call_errorhandler(
2459 errors, &errorHandler,
2460 "unicode_internal", reason,
2461 starts, size, &startinpos, &endinpos, &exc, &s,
2462 (PyObject **)&v, &outpos, &p)) {
2463 goto onError;
2464 }
2465 }
2466 else {
2467 p++;
2468 s += Py_UNICODE_SIZE;
2469 }
2470 }
2471
Martin v. Löwis412fb672006-04-13 06:34:32 +00002472 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002473 goto onError;
2474 Py_XDECREF(errorHandler);
2475 Py_XDECREF(exc);
2476 return (PyObject *)v;
2477
2478 onError:
2479 Py_XDECREF(v);
2480 Py_XDECREF(errorHandler);
2481 Py_XDECREF(exc);
2482 return NULL;
2483}
2484
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485/* --- Latin-1 Codec ------------------------------------------------------ */
2486
2487PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002488 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 const char *errors)
2490{
2491 PyUnicodeObject *v;
2492 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002493
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002495 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002496 Py_UNICODE r = *(unsigned char*)s;
2497 return PyUnicode_FromUnicode(&r, 1);
2498 }
2499
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 v = _PyUnicode_New(size);
2501 if (v == NULL)
2502 goto onError;
2503 if (size == 0)
2504 return (PyObject *)v;
2505 p = PyUnicode_AS_UNICODE(v);
2506 while (size-- > 0)
2507 *p++ = (unsigned char)*s++;
2508 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002509
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 onError:
2511 Py_XDECREF(v);
2512 return NULL;
2513}
2514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002515/* create or adjust a UnicodeEncodeError */
2516static void make_encode_exception(PyObject **exceptionObject,
2517 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002518 const Py_UNICODE *unicode, Py_ssize_t size,
2519 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002520 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002522 if (*exceptionObject == NULL) {
2523 *exceptionObject = PyUnicodeEncodeError_Create(
2524 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 }
2526 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002527 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2528 goto onError;
2529 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2530 goto onError;
2531 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2532 goto onError;
2533 return;
2534 onError:
2535 Py_DECREF(*exceptionObject);
2536 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 }
2538}
2539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540/* raises a UnicodeEncodeError */
2541static void raise_encode_exception(PyObject **exceptionObject,
2542 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002543 const Py_UNICODE *unicode, Py_ssize_t size,
2544 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002545 const char *reason)
2546{
2547 make_encode_exception(exceptionObject,
2548 encoding, unicode, size, startpos, endpos, reason);
2549 if (*exceptionObject != NULL)
2550 PyCodec_StrictErrors(*exceptionObject);
2551}
2552
2553/* error handling callback helper:
2554 build arguments, call the callback and check the arguments,
2555 put the result into newpos and return the replacement string, which
2556 has to be freed by the caller */
2557static PyObject *unicode_encode_call_errorhandler(const char *errors,
2558 PyObject **errorHandler,
2559 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002560 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2561 Py_ssize_t startpos, Py_ssize_t endpos,
2562 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002564 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002565
2566 PyObject *restuple;
2567 PyObject *resunicode;
2568
2569 if (*errorHandler == NULL) {
2570 *errorHandler = PyCodec_LookupError(errors);
2571 if (*errorHandler == NULL)
2572 return NULL;
2573 }
2574
2575 make_encode_exception(exceptionObject,
2576 encoding, unicode, size, startpos, endpos, reason);
2577 if (*exceptionObject == NULL)
2578 return NULL;
2579
2580 restuple = PyObject_CallFunctionObjArgs(
2581 *errorHandler, *exceptionObject, NULL);
2582 if (restuple == NULL)
2583 return NULL;
2584 if (!PyTuple_Check(restuple)) {
2585 PyErr_Format(PyExc_TypeError, &argparse[4]);
2586 Py_DECREF(restuple);
2587 return NULL;
2588 }
2589 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2590 &resunicode, newpos)) {
2591 Py_DECREF(restuple);
2592 return NULL;
2593 }
2594 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002595 *newpos = size+*newpos;
2596 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002597 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002598 Py_DECREF(restuple);
2599 return NULL;
2600 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002601 Py_INCREF(resunicode);
2602 Py_DECREF(restuple);
2603 return resunicode;
2604}
2605
2606static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002607 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 const char *errors,
2609 int limit)
2610{
2611 /* output object */
2612 PyObject *res;
2613 /* pointers to the beginning and end+1 of input */
2614 const Py_UNICODE *startp = p;
2615 const Py_UNICODE *endp = p + size;
2616 /* pointer to the beginning of the unencodable characters */
2617 /* const Py_UNICODE *badp = NULL; */
2618 /* pointer into the output */
2619 char *str;
2620 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002621 Py_ssize_t respos = 0;
2622 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002623 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2624 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 PyObject *errorHandler = NULL;
2626 PyObject *exc = NULL;
2627 /* the following variable is used for caching string comparisons
2628 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2629 int known_errorHandler = -1;
2630
2631 /* allocate enough for a simple encoding without
2632 replacements, if we need more, we'll resize */
2633 res = PyString_FromStringAndSize(NULL, size);
2634 if (res == NULL)
2635 goto onError;
2636 if (size == 0)
2637 return res;
2638 str = PyString_AS_STRING(res);
2639 ressize = size;
2640
2641 while (p<endp) {
2642 Py_UNICODE c = *p;
2643
2644 /* can we encode this? */
2645 if (c<limit) {
2646 /* no overflow check, because we know that the space is enough */
2647 *str++ = (char)c;
2648 ++p;
2649 }
2650 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002651 Py_ssize_t unicodepos = p-startp;
2652 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002653 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002654 Py_ssize_t repsize;
2655 Py_ssize_t newpos;
2656 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002657 Py_UNICODE *uni2;
2658 /* startpos for collecting unencodable chars */
2659 const Py_UNICODE *collstart = p;
2660 const Py_UNICODE *collend = p;
2661 /* find all unecodable characters */
2662 while ((collend < endp) && ((*collend)>=limit))
2663 ++collend;
2664 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2665 if (known_errorHandler==-1) {
2666 if ((errors==NULL) || (!strcmp(errors, "strict")))
2667 known_errorHandler = 1;
2668 else if (!strcmp(errors, "replace"))
2669 known_errorHandler = 2;
2670 else if (!strcmp(errors, "ignore"))
2671 known_errorHandler = 3;
2672 else if (!strcmp(errors, "xmlcharrefreplace"))
2673 known_errorHandler = 4;
2674 else
2675 known_errorHandler = 0;
2676 }
2677 switch (known_errorHandler) {
2678 case 1: /* strict */
2679 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2680 goto onError;
2681 case 2: /* replace */
2682 while (collstart++<collend)
2683 *str++ = '?'; /* fall through */
2684 case 3: /* ignore */
2685 p = collend;
2686 break;
2687 case 4: /* xmlcharrefreplace */
2688 respos = str-PyString_AS_STRING(res);
2689 /* determine replacement size (temporarily (mis)uses p) */
2690 for (p = collstart, repsize = 0; p < collend; ++p) {
2691 if (*p<10)
2692 repsize += 2+1+1;
2693 else if (*p<100)
2694 repsize += 2+2+1;
2695 else if (*p<1000)
2696 repsize += 2+3+1;
2697 else if (*p<10000)
2698 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002699#ifndef Py_UNICODE_WIDE
2700 else
2701 repsize += 2+5+1;
2702#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002703 else if (*p<100000)
2704 repsize += 2+5+1;
2705 else if (*p<1000000)
2706 repsize += 2+6+1;
2707 else
2708 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002709#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 }
2711 requiredsize = respos+repsize+(endp-collend);
2712 if (requiredsize > ressize) {
2713 if (requiredsize<2*ressize)
2714 requiredsize = 2*ressize;
2715 if (_PyString_Resize(&res, requiredsize))
2716 goto onError;
2717 str = PyString_AS_STRING(res) + respos;
2718 ressize = requiredsize;
2719 }
2720 /* generate replacement (temporarily (mis)uses p) */
2721 for (p = collstart; p < collend; ++p) {
2722 str += sprintf(str, "&#%d;", (int)*p);
2723 }
2724 p = collend;
2725 break;
2726 default:
2727 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2728 encoding, reason, startp, size, &exc,
2729 collstart-startp, collend-startp, &newpos);
2730 if (repunicode == NULL)
2731 goto onError;
2732 /* need more space? (at least enough for what we
2733 have+the replacement+the rest of the string, so
2734 we won't have to check space for encodable characters) */
2735 respos = str-PyString_AS_STRING(res);
2736 repsize = PyUnicode_GET_SIZE(repunicode);
2737 requiredsize = respos+repsize+(endp-collend);
2738 if (requiredsize > ressize) {
2739 if (requiredsize<2*ressize)
2740 requiredsize = 2*ressize;
2741 if (_PyString_Resize(&res, requiredsize)) {
2742 Py_DECREF(repunicode);
2743 goto onError;
2744 }
2745 str = PyString_AS_STRING(res) + respos;
2746 ressize = requiredsize;
2747 }
2748 /* check if there is anything unencodable in the replacement
2749 and copy it to the output */
2750 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2751 c = *uni2;
2752 if (c >= limit) {
2753 raise_encode_exception(&exc, encoding, startp, size,
2754 unicodepos, unicodepos+1, reason);
2755 Py_DECREF(repunicode);
2756 goto onError;
2757 }
2758 *str = (char)c;
2759 }
2760 p = startp + newpos;
2761 Py_DECREF(repunicode);
2762 }
2763 }
2764 }
2765 /* Resize if we allocated to much */
2766 respos = str-PyString_AS_STRING(res);
2767 if (respos<ressize)
2768 /* If this falls res will be NULL */
2769 _PyString_Resize(&res, respos);
2770 Py_XDECREF(errorHandler);
2771 Py_XDECREF(exc);
2772 return res;
2773
2774 onError:
2775 Py_XDECREF(res);
2776 Py_XDECREF(errorHandler);
2777 Py_XDECREF(exc);
2778 return NULL;
2779}
2780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002782 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 const char *errors)
2784{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786}
2787
2788PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2789{
2790 if (!PyUnicode_Check(unicode)) {
2791 PyErr_BadArgument();
2792 return NULL;
2793 }
2794 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2795 PyUnicode_GET_SIZE(unicode),
2796 NULL);
2797}
2798
2799/* --- 7-bit ASCII Codec -------------------------------------------------- */
2800
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002802 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 const char *errors)
2804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 PyUnicodeObject *v;
2807 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002808 Py_ssize_t startinpos;
2809 Py_ssize_t endinpos;
2810 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 const char *e;
2812 PyObject *errorHandler = NULL;
2813 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002814
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002816 if (size == 1 && *(unsigned char*)s < 128) {
2817 Py_UNICODE r = *(unsigned char*)s;
2818 return PyUnicode_FromUnicode(&r, 1);
2819 }
Tim Petersced69f82003-09-16 20:30:58 +00002820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 v = _PyUnicode_New(size);
2822 if (v == NULL)
2823 goto onError;
2824 if (size == 0)
2825 return (PyObject *)v;
2826 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 e = s + size;
2828 while (s < e) {
2829 register unsigned char c = (unsigned char)*s;
2830 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832 ++s;
2833 }
2834 else {
2835 startinpos = s-starts;
2836 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002837 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 if (unicode_decode_call_errorhandler(
2839 errors, &errorHandler,
2840 "ascii", "ordinal not in range(128)",
2841 starts, size, &startinpos, &endinpos, &exc, &s,
2842 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002846 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002847 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002848 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849 Py_XDECREF(errorHandler);
2850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002852
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 onError:
2854 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002855 Py_XDECREF(errorHandler);
2856 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 return NULL;
2858}
2859
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002861 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 const char *errors)
2863{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865}
2866
2867PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2868{
2869 if (!PyUnicode_Check(unicode)) {
2870 PyErr_BadArgument();
2871 return NULL;
2872 }
2873 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2874 PyUnicode_GET_SIZE(unicode),
2875 NULL);
2876}
2877
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002878#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002879
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002880/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002881
Martin v. Löwisd8251432006-06-14 05:21:04 +00002882#if SIZEOF_INT < SIZEOF_SSIZE_T
2883#define NEED_RETRY
2884#endif
2885
2886/* XXX This code is limited to "true" double-byte encodings, as
2887 a) it assumes an incomplete character consists of a single byte, and
2888 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2889 encodings, see IsDBCSLeadByteEx documentation. */
2890
/* Return non-zero if the byte at S[OFFSET] should be treated as a DBCS
   lead byte.  The byte itself must satisfy IsDBCSLeadByte(); in
   addition, the position CharPrev() reports for the preceding
   character must either be the byte itself, not be a lead byte, or lie
   exactly two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
2901
2902/*
2903 * Decode MBCS string into unicode object. If 'final' is set, converts
2904 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2905 */
2906static int decode_mbcs(PyUnicodeObject **v,
2907 const char *s, /* MBCS string */
2908 int size, /* sizeof MBCS string */
2909 int final)
2910{
2911 Py_UNICODE *p;
2912 Py_ssize_t n = 0;
2913 int usize = 0;
2914
2915 assert(size >= 0);
2916
2917 /* Skip trailing lead-byte unless 'final' is set */
2918 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2919 --size;
2920
2921 /* First get the size of the result */
2922 if (size > 0) {
2923 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2924 if (usize == 0) {
2925 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2926 return -1;
2927 }
2928 }
2929
2930 if (*v == NULL) {
2931 /* Create unicode object */
2932 *v = _PyUnicode_New(usize);
2933 if (*v == NULL)
2934 return -1;
2935 }
2936 else {
2937 /* Extend unicode object */
2938 n = PyUnicode_GET_SIZE(*v);
2939 if (_PyUnicode_Resize(v, n + usize) < 0)
2940 return -1;
2941 }
2942
2943 /* Do the conversion */
2944 if (size > 0) {
2945 p = PyUnicode_AS_UNICODE(*v) + n;
2946 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2947 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2948 return -1;
2949 }
2950 }
2951
2952 return size;
2953}
2954
2955PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2956 Py_ssize_t size,
2957 const char *errors,
2958 Py_ssize_t *consumed)
2959{
2960 PyUnicodeObject *v = NULL;
2961 int done;
2962
2963 if (consumed)
2964 *consumed = 0;
2965
2966#ifdef NEED_RETRY
2967 retry:
2968 if (size > INT_MAX)
2969 done = decode_mbcs(&v, s, INT_MAX, 0);
2970 else
2971#endif
2972 done = decode_mbcs(&v, s, (int)size, !consumed);
2973
2974 if (done < 0) {
2975 Py_XDECREF(v);
2976 return NULL;
2977 }
2978
2979 if (consumed)
2980 *consumed += done;
2981
2982#ifdef NEED_RETRY
2983 if (size > INT_MAX) {
2984 s += done;
2985 size -= done;
2986 goto retry;
2987 }
2988#endif
2989
2990 return (PyObject *)v;
2991}
2992
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002993PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002994 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002995 const char *errors)
2996{
Martin v. Löwisd8251432006-06-14 05:21:04 +00002997 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2998}
2999
3000/*
3001 * Convert unicode into string object (MBCS).
3002 * Returns 0 if succeed, -1 otherwise.
3003 */
3004static int encode_mbcs(PyObject **repr,
3005 const Py_UNICODE *p, /* unicode */
3006 int size) /* size of unicode */
3007{
3008 int mbcssize = 0;
3009 Py_ssize_t n = 0;
3010
3011 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003012
3013 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003014 if (size > 0) {
3015 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3016 if (mbcssize == 0) {
3017 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3018 return -1;
3019 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003020 }
3021
Martin v. Löwisd8251432006-06-14 05:21:04 +00003022 if (*repr == NULL) {
3023 /* Create string object */
3024 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3025 if (*repr == NULL)
3026 return -1;
3027 }
3028 else {
3029 /* Extend string object */
3030 n = PyString_Size(*repr);
3031 if (_PyString_Resize(repr, n + mbcssize) < 0)
3032 return -1;
3033 }
3034
3035 /* Do the conversion */
3036 if (size > 0) {
3037 char *s = PyString_AS_STRING(*repr) + n;
3038 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3039 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3040 return -1;
3041 }
3042 }
3043
3044 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003045}
3046
3047PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003048 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003049 const char *errors)
3050{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003051 PyObject *repr = NULL;
3052 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003053
Martin v. Löwisd8251432006-06-14 05:21:04 +00003054#ifdef NEED_RETRY
3055 retry:
3056 if (size > INT_MAX)
3057 ret = encode_mbcs(&repr, p, INT_MAX);
3058 else
3059#endif
3060 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003061
Martin v. Löwisd8251432006-06-14 05:21:04 +00003062 if (ret < 0) {
3063 Py_XDECREF(repr);
3064 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003065 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003066
3067#ifdef NEED_RETRY
3068 if (size > INT_MAX) {
3069 p += INT_MAX;
3070 size -= INT_MAX;
3071 goto retry;
3072 }
3073#endif
3074
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003075 return repr;
3076}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003077
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003078PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3079{
3080 if (!PyUnicode_Check(unicode)) {
3081 PyErr_BadArgument();
3082 return NULL;
3083 }
3084 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3085 PyUnicode_GET_SIZE(unicode),
3086 NULL);
3087}
3088
Martin v. Löwisd8251432006-06-14 05:21:04 +00003089#undef NEED_RETRY
3090
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003091#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003092
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093/* --- Character Mapping Codec -------------------------------------------- */
3094
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003096 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 PyObject *mapping,
3098 const char *errors)
3099{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003100 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003101 Py_ssize_t startinpos;
3102 Py_ssize_t endinpos;
3103 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 PyUnicodeObject *v;
3106 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003107 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 PyObject *errorHandler = NULL;
3109 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003110 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003111 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003112
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 /* Default to Latin-1 */
3114 if (mapping == NULL)
3115 return PyUnicode_DecodeLatin1(s, size, errors);
3116
3117 v = _PyUnicode_New(size);
3118 if (v == NULL)
3119 goto onError;
3120 if (size == 0)
3121 return (PyObject *)v;
3122 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003123 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003124 if (PyUnicode_CheckExact(mapping)) {
3125 mapstring = PyUnicode_AS_UNICODE(mapping);
3126 maplen = PyUnicode_GET_SIZE(mapping);
3127 while (s < e) {
3128 unsigned char ch = *s;
3129 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003131 if (ch < maplen)
3132 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003134 if (x == 0xfffe) {
3135 /* undefined mapping */
3136 outpos = p-PyUnicode_AS_UNICODE(v);
3137 startinpos = s-starts;
3138 endinpos = startinpos+1;
3139 if (unicode_decode_call_errorhandler(
3140 errors, &errorHandler,
3141 "charmap", "character maps to <undefined>",
3142 starts, size, &startinpos, &endinpos, &exc, &s,
3143 (PyObject **)&v, &outpos, &p)) {
3144 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003145 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003146 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003147 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003148 *p++ = x;
3149 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003151 }
3152 else {
3153 while (s < e) {
3154 unsigned char ch = *s;
3155 PyObject *w, *x;
3156
3157 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3158 w = PyInt_FromLong((long)ch);
3159 if (w == NULL)
3160 goto onError;
3161 x = PyObject_GetItem(mapping, w);
3162 Py_DECREF(w);
3163 if (x == NULL) {
3164 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3165 /* No mapping found means: mapping is undefined. */
3166 PyErr_Clear();
3167 x = Py_None;
3168 Py_INCREF(x);
3169 } else
3170 goto onError;
3171 }
3172
3173 /* Apply mapping */
3174 if (PyInt_Check(x)) {
3175 long value = PyInt_AS_LONG(x);
3176 if (value < 0 || value > 65535) {
3177 PyErr_SetString(PyExc_TypeError,
3178 "character mapping must be in range(65536)");
3179 Py_DECREF(x);
3180 goto onError;
3181 }
3182 *p++ = (Py_UNICODE)value;
3183 }
3184 else if (x == Py_None) {
3185 /* undefined mapping */
3186 outpos = p-PyUnicode_AS_UNICODE(v);
3187 startinpos = s-starts;
3188 endinpos = startinpos+1;
3189 if (unicode_decode_call_errorhandler(
3190 errors, &errorHandler,
3191 "charmap", "character maps to <undefined>",
3192 starts, size, &startinpos, &endinpos, &exc, &s,
3193 (PyObject **)&v, &outpos, &p)) {
3194 Py_DECREF(x);
3195 goto onError;
3196 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003197 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003198 continue;
3199 }
3200 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003201 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003202
3203 if (targetsize == 1)
3204 /* 1-1 mapping */
3205 *p++ = *PyUnicode_AS_UNICODE(x);
3206
3207 else if (targetsize > 1) {
3208 /* 1-n mapping */
3209 if (targetsize > extrachars) {
3210 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003211 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3212 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003213 (targetsize << 2);
3214 extrachars += needed;
Armin Rigo4b63c212006-10-04 11:44:06 +00003215 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003216 if (_PyUnicode_Resize(&v,
3217 PyUnicode_GET_SIZE(v) + needed) < 0) {
3218 Py_DECREF(x);
3219 goto onError;
3220 }
3221 p = PyUnicode_AS_UNICODE(v) + oldpos;
3222 }
3223 Py_UNICODE_COPY(p,
3224 PyUnicode_AS_UNICODE(x),
3225 targetsize);
3226 p += targetsize;
3227 extrachars -= targetsize;
3228 }
3229 /* 1-0 mapping: skip the character */
3230 }
3231 else {
3232 /* wrong return value */
3233 PyErr_SetString(PyExc_TypeError,
3234 "character mapping must return integer, None or unicode");
3235 Py_DECREF(x);
3236 goto onError;
3237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003239 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 }
3242 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003243 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 Py_XDECREF(errorHandler);
3246 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003248
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 Py_XDECREF(errorHandler);
3251 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 Py_XDECREF(v);
3253 return NULL;
3254}
3255
Martin v. Löwis3f767792006-06-04 19:36:28 +00003256/* Charmap encoding: the lookup table */
3257
3258struct encoding_map{
3259 PyObject_HEAD
3260 unsigned char level1[32];
3261 int count2, count3;
3262 unsigned char level23[1];
3263};
3264
3265static PyObject*
3266encoding_map_size(PyObject *obj, PyObject* args)
3267{
3268 struct encoding_map *map = (struct encoding_map*)obj;
3269 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3270 128*map->count3);
3271}
3272
3273static PyMethodDef encoding_map_methods[] = {
3274 {"size", encoding_map_size, METH_NOARGS,
3275 PyDoc_STR("Return the size (in bytes) of this object") },
3276 { 0 }
3277};
3278
3279static void
3280encoding_map_dealloc(PyObject* o)
3281{
3282 PyObject_FREE(o);
3283}
3284
3285static PyTypeObject EncodingMapType = {
3286 PyObject_HEAD_INIT(NULL)
3287 0, /*ob_size*/
3288 "EncodingMap", /*tp_name*/
3289 sizeof(struct encoding_map), /*tp_basicsize*/
3290 0, /*tp_itemsize*/
3291 /* methods */
3292 encoding_map_dealloc, /*tp_dealloc*/
3293 0, /*tp_print*/
3294 0, /*tp_getattr*/
3295 0, /*tp_setattr*/
3296 0, /*tp_compare*/
3297 0, /*tp_repr*/
3298 0, /*tp_as_number*/
3299 0, /*tp_as_sequence*/
3300 0, /*tp_as_mapping*/
3301 0, /*tp_hash*/
3302 0, /*tp_call*/
3303 0, /*tp_str*/
3304 0, /*tp_getattro*/
3305 0, /*tp_setattro*/
3306 0, /*tp_as_buffer*/
3307 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3308 0, /*tp_doc*/
3309 0, /*tp_traverse*/
3310 0, /*tp_clear*/
3311 0, /*tp_richcompare*/
3312 0, /*tp_weaklistoffset*/
3313 0, /*tp_iter*/
3314 0, /*tp_iternext*/
3315 encoding_map_methods, /*tp_methods*/
3316 0, /*tp_members*/
3317 0, /*tp_getset*/
3318 0, /*tp_base*/
3319 0, /*tp_dict*/
3320 0, /*tp_descr_get*/
3321 0, /*tp_descr_set*/
3322 0, /*tp_dictoffset*/
3323 0, /*tp_init*/
3324 0, /*tp_alloc*/
3325 0, /*tp_new*/
3326 0, /*tp_free*/
3327 0, /*tp_is_gc*/
3328};
3329
3330PyObject*
3331PyUnicode_BuildEncodingMap(PyObject* string)
3332{
3333 Py_UNICODE *decode;
3334 PyObject *result;
3335 struct encoding_map *mresult;
3336 int i;
3337 int need_dict = 0;
3338 unsigned char level1[32];
3339 unsigned char level2[512];
3340 unsigned char *mlevel1, *mlevel2, *mlevel3;
3341 int count2 = 0, count3 = 0;
3342
3343 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3344 PyErr_BadArgument();
3345 return NULL;
3346 }
3347 decode = PyUnicode_AS_UNICODE(string);
3348 memset(level1, 0xFF, sizeof level1);
3349 memset(level2, 0xFF, sizeof level2);
3350
3351 /* If there isn't a one-to-one mapping of NULL to \0,
3352 or if there are non-BMP characters, we need to use
3353 a mapping dictionary. */
3354 if (decode[0] != 0)
3355 need_dict = 1;
3356 for (i = 1; i < 256; i++) {
3357 int l1, l2;
3358 if (decode[i] == 0
3359 #ifdef Py_UNICODE_WIDE
3360 || decode[i] > 0xFFFF
3361 #endif
3362 ) {
3363 need_dict = 1;
3364 break;
3365 }
3366 if (decode[i] == 0xFFFE)
3367 /* unmapped character */
3368 continue;
3369 l1 = decode[i] >> 11;
3370 l2 = decode[i] >> 7;
3371 if (level1[l1] == 0xFF)
3372 level1[l1] = count2++;
3373 if (level2[l2] == 0xFF)
3374 level2[l2] = count3++;
3375 }
3376
3377 if (count2 >= 0xFF || count3 >= 0xFF)
3378 need_dict = 1;
3379
3380 if (need_dict) {
3381 PyObject *result = PyDict_New();
3382 PyObject *key, *value;
3383 if (!result)
3384 return NULL;
3385 for (i = 0; i < 256; i++) {
3386 key = value = NULL;
3387 key = PyInt_FromLong(decode[i]);
3388 value = PyInt_FromLong(i);
3389 if (!key || !value)
3390 goto failed1;
3391 if (PyDict_SetItem(result, key, value) == -1)
3392 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003393 Py_DECREF(key);
3394 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003395 }
3396 return result;
3397 failed1:
3398 Py_XDECREF(key);
3399 Py_XDECREF(value);
3400 Py_DECREF(result);
3401 return NULL;
3402 }
3403
3404 /* Create a three-level trie */
3405 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3406 16*count2 + 128*count3 - 1);
3407 if (!result)
3408 return PyErr_NoMemory();
3409 PyObject_Init(result, &EncodingMapType);
3410 mresult = (struct encoding_map*)result;
3411 mresult->count2 = count2;
3412 mresult->count3 = count3;
3413 mlevel1 = mresult->level1;
3414 mlevel2 = mresult->level23;
3415 mlevel3 = mresult->level23 + 16*count2;
3416 memcpy(mlevel1, level1, 32);
3417 memset(mlevel2, 0xFF, 16*count2);
3418 memset(mlevel3, 0, 128*count3);
3419 count3 = 0;
3420 for (i = 1; i < 256; i++) {
3421 int o1, o2, o3, i2, i3;
3422 if (decode[i] == 0xFFFE)
3423 /* unmapped character */
3424 continue;
3425 o1 = decode[i]>>11;
3426 o2 = (decode[i]>>7) & 0xF;
3427 i2 = 16*mlevel1[o1] + o2;
3428 if (mlevel2[i2] == 0xFF)
3429 mlevel2[i2] = count3++;
3430 o3 = decode[i] & 0x7F;
3431 i3 = 128*mlevel2[i2] + o3;
3432 mlevel3[i3] = i;
3433 }
3434 return result;
3435}
3436
3437static int
3438encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3439{
3440 struct encoding_map *map = (struct encoding_map*)mapping;
3441 int l1 = c>>11;
3442 int l2 = (c>>7) & 0xF;
3443 int l3 = c & 0x7F;
3444 int i;
3445
3446#ifdef Py_UNICODE_WIDE
3447 if (c > 0xFFFF) {
3448 return -1;
3449 }
3450#endif
3451 if (c == 0)
3452 return 0;
3453 /* level 1*/
3454 i = map->level1[l1];
3455 if (i == 0xFF) {
3456 return -1;
3457 }
3458 /* level 2*/
3459 i = map->level23[16*i+l2];
3460 if (i == 0xFF) {
3461 return -1;
3462 }
3463 /* level 3 */
3464 i = map->level23[16*map->count2 + 128*i + l3];
3465 if (i == 0) {
3466 return -1;
3467 }
3468 return i;
3469}
3470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471/* Lookup the character ch in the mapping. If the character
3472 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003473 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 PyObject *w = PyInt_FromLong((long)c);
3477 PyObject *x;
3478
3479 if (w == NULL)
3480 return NULL;
3481 x = PyObject_GetItem(mapping, w);
3482 Py_DECREF(w);
3483 if (x == NULL) {
3484 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3485 /* No mapping found means: mapping is undefined. */
3486 PyErr_Clear();
3487 x = Py_None;
3488 Py_INCREF(x);
3489 return x;
3490 } else
3491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003493 else if (x == Py_None)
3494 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 else if (PyInt_Check(x)) {
3496 long value = PyInt_AS_LONG(x);
3497 if (value < 0 || value > 255) {
3498 PyErr_SetString(PyExc_TypeError,
3499 "character mapping must be in range(256)");
3500 Py_DECREF(x);
3501 return NULL;
3502 }
3503 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 else if (PyString_Check(x))
3506 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 /* wrong return value */
3509 PyErr_SetString(PyExc_TypeError,
3510 "character mapping must return integer, None or str");
3511 Py_DECREF(x);
3512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 }
3514}
3515
Martin v. Löwis3f767792006-06-04 19:36:28 +00003516static int
3517charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3518{
3519 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3520 /* exponentially overallocate to minimize reallocations */
3521 if (requiredsize < 2*outsize)
3522 requiredsize = 2*outsize;
3523 if (_PyString_Resize(outobj, requiredsize)) {
3524 return 0;
3525 }
3526 return 1;
3527}
3528
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* a Python exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532/* lookup the character, put the result in the output string and adjust
3533 various state variables. Reallocate the output string if not enough
3534 space is available. Return a new reference to the object that
3535 was put in the output buffer, or Py_None, if the mapping was undefined
3536 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003537 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003539charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003540 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003542 PyObject *rep;
3543 char *outstart;
3544 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545
Martin v. Löwis3f767792006-06-04 19:36:28 +00003546 if (mapping->ob_type == &EncodingMapType) {
3547 int res = encoding_map_lookup(c, mapping);
3548 Py_ssize_t requiredsize = *outpos+1;
3549 if (res == -1)
3550 return enc_FAILED;
3551 if (outsize<requiredsize)
3552 if (!charmapencode_resize(outobj, outpos, requiredsize))
3553 return enc_EXCEPTION;
3554 outstart = PyString_AS_STRING(*outobj);
3555 outstart[(*outpos)++] = (char)res;
3556 return enc_SUCCESS;
3557 }
3558
3559 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003561 return enc_EXCEPTION;
3562 else if (rep==Py_None) {
3563 Py_DECREF(rep);
3564 return enc_FAILED;
3565 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003568 if (outsize<requiredsize)
3569 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003571 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003573 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3575 }
3576 else {
3577 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003578 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3579 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003580 if (outsize<requiredsize)
3581 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003583 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003585 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 memcpy(outstart + *outpos, repchars, repsize);
3587 *outpos += repsize;
3588 }
3589 }
Georg Brandl9f167602006-06-04 21:46:16 +00003590 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003591 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592}
3593
3594/* handle an error in PyUnicode_EncodeCharmap
3595 Return 0 on success, -1 on error */
3596static
3597int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003598 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003600 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003601 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602{
3603 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003604 Py_ssize_t repsize;
3605 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 Py_UNICODE *uni2;
3607 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003608 Py_ssize_t collstartpos = *inpos;
3609 Py_ssize_t collendpos = *inpos+1;
3610 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 char *encoding = "charmap";
3612 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003613 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 /* find all unencodable characters */
3616 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003617 PyObject *rep;
3618 if (mapping->ob_type == &EncodingMapType) {
3619 int res = encoding_map_lookup(p[collendpos], mapping);
3620 if (res != -1)
3621 break;
3622 ++collendpos;
3623 continue;
3624 }
3625
3626 rep = charmapencode_lookup(p[collendpos], mapping);
3627 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003629 else if (rep!=Py_None) {
3630 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 break;
3632 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003633 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 ++collendpos;
3635 }
3636 /* cache callback name lookup
3637 * (if not done yet, i.e. it's the first error) */
3638 if (*known_errorHandler==-1) {
3639 if ((errors==NULL) || (!strcmp(errors, "strict")))
3640 *known_errorHandler = 1;
3641 else if (!strcmp(errors, "replace"))
3642 *known_errorHandler = 2;
3643 else if (!strcmp(errors, "ignore"))
3644 *known_errorHandler = 3;
3645 else if (!strcmp(errors, "xmlcharrefreplace"))
3646 *known_errorHandler = 4;
3647 else
3648 *known_errorHandler = 0;
3649 }
3650 switch (*known_errorHandler) {
3651 case 1: /* strict */
3652 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3653 return -1;
3654 case 2: /* replace */
3655 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3656 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003657 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 return -1;
3659 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003660 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3662 return -1;
3663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 }
3665 /* fall through */
3666 case 3: /* ignore */
3667 *inpos = collendpos;
3668 break;
3669 case 4: /* xmlcharrefreplace */
3670 /* generate replacement (temporarily (mis)uses p) */
3671 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3672 char buffer[2+29+1+1];
3673 char *cp;
3674 sprintf(buffer, "&#%d;", (int)p[collpos]);
3675 for (cp = buffer; *cp; ++cp) {
3676 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003677 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003679 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3681 return -1;
3682 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 }
3684 }
3685 *inpos = collendpos;
3686 break;
3687 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003688 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 encoding, reason, p, size, exceptionObject,
3690 collstartpos, collendpos, &newpos);
3691 if (repunicode == NULL)
3692 return -1;
3693 /* generate replacement */
3694 repsize = PyUnicode_GET_SIZE(repunicode);
3695 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3696 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003697 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 return -1;
3699 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003700 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3703 return -1;
3704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 }
3706 *inpos = newpos;
3707 Py_DECREF(repunicode);
3708 }
3709 return 0;
3710}
3711
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003713 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 PyObject *mapping,
3715 const char *errors)
3716{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717 /* output object */
3718 PyObject *res = NULL;
3719 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003720 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003722 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 PyObject *errorHandler = NULL;
3724 PyObject *exc = NULL;
3725 /* the following variable is used for caching string comparisons
3726 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3727 * 3=ignore, 4=xmlcharrefreplace */
3728 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729
3730 /* Default to Latin-1 */
3731 if (mapping == NULL)
3732 return PyUnicode_EncodeLatin1(p, size, errors);
3733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 /* allocate enough for a simple encoding without
3735 replacements, if we need more, we'll resize */
3736 res = PyString_FromStringAndSize(NULL, size);
3737 if (res == NULL)
3738 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003739 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 while (inpos<size) {
3743 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003744 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3745 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003747 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 if (charmap_encoding_error(p, size, &inpos, mapping,
3749 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003750 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003751 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003752 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 else
3756 /* done with this character => adjust input position */
3757 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 /* Resize if we allocated to much */
3761 if (respos<PyString_GET_SIZE(res)) {
3762 if (_PyString_Resize(&res, respos))
3763 goto onError;
3764 }
3765 Py_XDECREF(exc);
3766 Py_XDECREF(errorHandler);
3767 return res;
3768
3769 onError:
3770 Py_XDECREF(res);
3771 Py_XDECREF(exc);
3772 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 return NULL;
3774}
3775
3776PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3777 PyObject *mapping)
3778{
3779 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3780 PyErr_BadArgument();
3781 return NULL;
3782 }
3783 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3784 PyUnicode_GET_SIZE(unicode),
3785 mapping,
3786 NULL);
3787}
3788
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789/* create or adjust a UnicodeTranslateError */
3790static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003791 const Py_UNICODE *unicode, Py_ssize_t size,
3792 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 if (*exceptionObject == NULL) {
3796 *exceptionObject = PyUnicodeTranslateError_Create(
3797 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 }
3799 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3801 goto onError;
3802 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3803 goto onError;
3804 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3805 goto onError;
3806 return;
3807 onError:
3808 Py_DECREF(*exceptionObject);
3809 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 }
3811}
3812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813/* raises a UnicodeTranslateError */
3814static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003815 const Py_UNICODE *unicode, Py_ssize_t size,
3816 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 const char *reason)
3818{
3819 make_translate_exception(exceptionObject,
3820 unicode, size, startpos, endpos, reason);
3821 if (*exceptionObject != NULL)
3822 PyCodec_StrictErrors(*exceptionObject);
3823}
3824
3825/* error handling callback helper:
3826 build arguments, call the callback and check the arguments,
3827 put the result into newpos and return the replacement string, which
3828 has to be freed by the caller */
3829static PyObject *unicode_translate_call_errorhandler(const char *errors,
3830 PyObject **errorHandler,
3831 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003832 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3833 Py_ssize_t startpos, Py_ssize_t endpos,
3834 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003836 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003837
Martin v. Löwis412fb672006-04-13 06:34:32 +00003838 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 PyObject *restuple;
3840 PyObject *resunicode;
3841
3842 if (*errorHandler == NULL) {
3843 *errorHandler = PyCodec_LookupError(errors);
3844 if (*errorHandler == NULL)
3845 return NULL;
3846 }
3847
3848 make_translate_exception(exceptionObject,
3849 unicode, size, startpos, endpos, reason);
3850 if (*exceptionObject == NULL)
3851 return NULL;
3852
3853 restuple = PyObject_CallFunctionObjArgs(
3854 *errorHandler, *exceptionObject, NULL);
3855 if (restuple == NULL)
3856 return NULL;
3857 if (!PyTuple_Check(restuple)) {
3858 PyErr_Format(PyExc_TypeError, &argparse[4]);
3859 Py_DECREF(restuple);
3860 return NULL;
3861 }
3862 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003863 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 Py_DECREF(restuple);
3865 return NULL;
3866 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003867 if (i_newpos<0)
3868 *newpos = size+i_newpos;
3869 else
3870 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003871 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003872 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003873 Py_DECREF(restuple);
3874 return NULL;
3875 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 Py_INCREF(resunicode);
3877 Py_DECREF(restuple);
3878 return resunicode;
3879}
3880
3881/* Lookup the character ch in the mapping and put the result in result,
3882 which must be decrefed by the caller.
3883 Return 0 on success, -1 on error */
3884static
3885int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3886{
3887 PyObject *w = PyInt_FromLong((long)c);
3888 PyObject *x;
3889
3890 if (w == NULL)
3891 return -1;
3892 x = PyObject_GetItem(mapping, w);
3893 Py_DECREF(w);
3894 if (x == NULL) {
3895 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3896 /* No mapping found means: use 1:1 mapping. */
3897 PyErr_Clear();
3898 *result = NULL;
3899 return 0;
3900 } else
3901 return -1;
3902 }
3903 else if (x == Py_None) {
3904 *result = x;
3905 return 0;
3906 }
3907 else if (PyInt_Check(x)) {
3908 long value = PyInt_AS_LONG(x);
3909 long max = PyUnicode_GetMax();
3910 if (value < 0 || value > max) {
3911 PyErr_Format(PyExc_TypeError,
3912 "character mapping must be in range(0x%lx)", max+1);
3913 Py_DECREF(x);
3914 return -1;
3915 }
3916 *result = x;
3917 return 0;
3918 }
3919 else if (PyUnicode_Check(x)) {
3920 *result = x;
3921 return 0;
3922 }
3923 else {
3924 /* wrong return value */
3925 PyErr_SetString(PyExc_TypeError,
3926 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003927 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 return -1;
3929 }
3930}
3931/* ensure that *outobj is at least requiredsize characters long,
3932if not reallocate and adjust various state variables.
3933Return 0 on success, -1 on error */
3934static
Walter Dörwald4894c302003-10-24 14:25:28 +00003935int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003936 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003938 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003939 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003941 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003943 if (requiredsize < 2 * oldsize)
3944 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003945 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 return -1;
3947 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 }
3949 return 0;
3950}
3951/* lookup the character, put the result in the output string and adjust
3952 various state variables. Return a new reference to the object that
3953 was put in the output buffer in *result, or Py_None, if the mapping was
3954 undefined (in which case no character was written).
3955 The called must decref result.
3956 Return 0 on success, -1 on error. */
3957static
Walter Dörwald4894c302003-10-24 14:25:28 +00003958int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003960 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961{
Walter Dörwald4894c302003-10-24 14:25:28 +00003962 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 return -1;
3964 if (*res==NULL) {
3965 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003966 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 }
3968 else if (*res==Py_None)
3969 ;
3970 else if (PyInt_Check(*res)) {
3971 /* no overflow check, because we know that the space is enough */
3972 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3973 }
3974 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003975 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 if (repsize==1) {
3977 /* no overflow check, because we know that the space is enough */
3978 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3979 }
3980 else if (repsize!=0) {
3981 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003982 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003983 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003984 repsize - 1;
3985 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 return -1;
3987 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3988 *outp += repsize;
3989 }
3990 }
3991 else
3992 return -1;
3993 return 0;
3994}
3995
3996PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003997 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 PyObject *mapping,
3999 const char *errors)
4000{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 /* output object */
4002 PyObject *res = NULL;
4003 /* pointers to the beginning and end+1 of input */
4004 const Py_UNICODE *startp = p;
4005 const Py_UNICODE *endp = p + size;
4006 /* pointer into the output */
4007 Py_UNICODE *str;
4008 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004009 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010 char *reason = "character maps to <undefined>";
4011 PyObject *errorHandler = NULL;
4012 PyObject *exc = NULL;
4013 /* the following variable is used for caching string comparisons
4014 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4015 * 3=ignore, 4=xmlcharrefreplace */
4016 int known_errorHandler = -1;
4017
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 if (mapping == NULL) {
4019 PyErr_BadArgument();
4020 return NULL;
4021 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022
4023 /* allocate enough for a simple 1:1 translation without
4024 replacements, if we need more, we'll resize */
4025 res = PyUnicode_FromUnicode(NULL, size);
4026 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004027 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 return res;
4030 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 while (p<endp) {
4033 /* try to encode it */
4034 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004035 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 goto onError;
4038 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004039 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040 if (x!=Py_None) /* it worked => adjust input pointer */
4041 ++p;
4042 else { /* untranslatable character */
4043 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t repsize;
4045 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046 Py_UNICODE *uni2;
4047 /* startpos for collecting untranslatable chars */
4048 const Py_UNICODE *collstart = p;
4049 const Py_UNICODE *collend = p+1;
4050 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 /* find all untranslatable characters */
4053 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004054 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 goto onError;
4056 Py_XDECREF(x);
4057 if (x!=Py_None)
4058 break;
4059 ++collend;
4060 }
4061 /* cache callback name lookup
4062 * (if not done yet, i.e. it's the first error) */
4063 if (known_errorHandler==-1) {
4064 if ((errors==NULL) || (!strcmp(errors, "strict")))
4065 known_errorHandler = 1;
4066 else if (!strcmp(errors, "replace"))
4067 known_errorHandler = 2;
4068 else if (!strcmp(errors, "ignore"))
4069 known_errorHandler = 3;
4070 else if (!strcmp(errors, "xmlcharrefreplace"))
4071 known_errorHandler = 4;
4072 else
4073 known_errorHandler = 0;
4074 }
4075 switch (known_errorHandler) {
4076 case 1: /* strict */
4077 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4078 goto onError;
4079 case 2: /* replace */
4080 /* No need to check for space, this is a 1:1 replacement */
4081 for (coll = collstart; coll<collend; ++coll)
4082 *str++ = '?';
4083 /* fall through */
4084 case 3: /* ignore */
4085 p = collend;
4086 break;
4087 case 4: /* xmlcharrefreplace */
4088 /* generate replacement (temporarily (mis)uses p) */
4089 for (p = collstart; p < collend; ++p) {
4090 char buffer[2+29+1+1];
4091 char *cp;
4092 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004093 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4095 goto onError;
4096 for (cp = buffer; *cp; ++cp)
4097 *str++ = *cp;
4098 }
4099 p = collend;
4100 break;
4101 default:
4102 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4103 reason, startp, size, &exc,
4104 collstart-startp, collend-startp, &newpos);
4105 if (repunicode == NULL)
4106 goto onError;
4107 /* generate replacement */
4108 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004109 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4111 Py_DECREF(repunicode);
4112 goto onError;
4113 }
4114 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4115 *str++ = *uni2;
4116 p = startp + newpos;
4117 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 }
4119 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 /* Resize if we allocated to much */
4122 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004123 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004124 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004125 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 }
4127 Py_XDECREF(exc);
4128 Py_XDECREF(errorHandler);
4129 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 onError:
4132 Py_XDECREF(res);
4133 Py_XDECREF(exc);
4134 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 return NULL;
4136}
4137
4138PyObject *PyUnicode_Translate(PyObject *str,
4139 PyObject *mapping,
4140 const char *errors)
4141{
4142 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004143
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 str = PyUnicode_FromObject(str);
4145 if (str == NULL)
4146 goto onError;
4147 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4148 PyUnicode_GET_SIZE(str),
4149 mapping,
4150 errors);
4151 Py_DECREF(str);
4152 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004153
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 onError:
4155 Py_XDECREF(str);
4156 return NULL;
4157}
Tim Petersced69f82003-09-16 20:30:58 +00004158
Guido van Rossum9e896b32000-04-05 20:11:21 +00004159/* --- Decimal Encoder ---------------------------------------------------- */
4160
4161int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004163 char *output,
4164 const char *errors)
4165{
4166 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 PyObject *errorHandler = NULL;
4168 PyObject *exc = NULL;
4169 const char *encoding = "decimal";
4170 const char *reason = "invalid decimal Unicode string";
4171 /* the following variable is used for caching string comparisons
4172 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4173 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004174
4175 if (output == NULL) {
4176 PyErr_BadArgument();
4177 return -1;
4178 }
4179
4180 p = s;
4181 end = s + length;
4182 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004184 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004186 Py_ssize_t repsize;
4187 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 Py_UNICODE *uni2;
4189 Py_UNICODE *collstart;
4190 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004191
Guido van Rossum9e896b32000-04-05 20:11:21 +00004192 if (Py_UNICODE_ISSPACE(ch)) {
4193 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004195 continue;
4196 }
4197 decimal = Py_UNICODE_TODECIMAL(ch);
4198 if (decimal >= 0) {
4199 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004201 continue;
4202 }
Guido van Rossumba477042000-04-06 18:18:10 +00004203 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004204 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004206 continue;
4207 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 /* All other characters are considered unencodable */
4209 collstart = p;
4210 collend = p+1;
4211 while (collend < end) {
4212 if ((0 < *collend && *collend < 256) ||
4213 !Py_UNICODE_ISSPACE(*collend) ||
4214 Py_UNICODE_TODECIMAL(*collend))
4215 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004216 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 /* cache callback name lookup
4218 * (if not done yet, i.e. it's the first error) */
4219 if (known_errorHandler==-1) {
4220 if ((errors==NULL) || (!strcmp(errors, "strict")))
4221 known_errorHandler = 1;
4222 else if (!strcmp(errors, "replace"))
4223 known_errorHandler = 2;
4224 else if (!strcmp(errors, "ignore"))
4225 known_errorHandler = 3;
4226 else if (!strcmp(errors, "xmlcharrefreplace"))
4227 known_errorHandler = 4;
4228 else
4229 known_errorHandler = 0;
4230 }
4231 switch (known_errorHandler) {
4232 case 1: /* strict */
4233 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4234 goto onError;
4235 case 2: /* replace */
4236 for (p = collstart; p < collend; ++p)
4237 *output++ = '?';
4238 /* fall through */
4239 case 3: /* ignore */
4240 p = collend;
4241 break;
4242 case 4: /* xmlcharrefreplace */
4243 /* generate replacement (temporarily (mis)uses p) */
4244 for (p = collstart; p < collend; ++p)
4245 output += sprintf(output, "&#%d;", (int)*p);
4246 p = collend;
4247 break;
4248 default:
4249 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4250 encoding, reason, s, length, &exc,
4251 collstart-s, collend-s, &newpos);
4252 if (repunicode == NULL)
4253 goto onError;
4254 /* generate replacement */
4255 repsize = PyUnicode_GET_SIZE(repunicode);
4256 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4257 Py_UNICODE ch = *uni2;
4258 if (Py_UNICODE_ISSPACE(ch))
4259 *output++ = ' ';
4260 else {
4261 decimal = Py_UNICODE_TODECIMAL(ch);
4262 if (decimal >= 0)
4263 *output++ = '0' + decimal;
4264 else if (0 < ch && ch < 256)
4265 *output++ = (char)ch;
4266 else {
4267 Py_DECREF(repunicode);
4268 raise_encode_exception(&exc, encoding,
4269 s, length, collstart-s, collend-s, reason);
4270 goto onError;
4271 }
4272 }
4273 }
4274 p = s + newpos;
4275 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004276 }
4277 }
4278 /* 0-terminate the output string */
4279 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 Py_XDECREF(exc);
4281 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004282 return 0;
4283
4284 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 Py_XDECREF(exc);
4286 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004287 return -1;
4288}
4289
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290/* --- Helpers ------------------------------------------------------------ */
4291
Fredrik Lundha50d2012006-05-26 17:04:58 +00004292#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004293
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004294#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004295#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004296#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004297
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004298Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004299STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4300{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004301 if (str[0] != other[0])
4302 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004303 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4304}
4305
Fredrik Lundhb9479482006-05-26 17:22:38 +00004306#define STRINGLIB_EMPTY unicode_empty
4307
Fredrik Lundha50d2012006-05-26 17:04:58 +00004308#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004309
4310#include "stringlib/count.h"
4311#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004312#include "stringlib/partition.h"
4313
/* Helper macro to fix up start/end slice values.

   Operates on the variables named `start` and `end` in the invoking
   scope and on (obj)->length:
     - a negative start is taken relative to the length, then clamped
       to 0;
     - an end beyond the length is clamped to the length; a negative end
       is taken relative to the length, then clamped to 0.

   The body is wrapped in do { ... } while (0) so that
   `FIX_START_END(x);` is exactly one statement and can be used safely
   as the body of an unbraced if/else.  Always invoke it with a
   trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4326
Martin v. Löwis18e16552006-02-15 17:27:45 +00004327Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004328 PyObject *substr,
4329 Py_ssize_t start,
4330 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004332 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004333 PyUnicodeObject* str_obj;
4334 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004335
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004336 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4337 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004339 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4340 if (!sub_obj) {
4341 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 return -1;
4343 }
Tim Petersced69f82003-09-16 20:30:58 +00004344
Fredrik Lundhc8162812006-05-26 19:33:03 +00004345 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004346
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004347 result = stringlib_count(
4348 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4349 );
4350
4351 Py_DECREF(sub_obj);
4352 Py_DECREF(str_obj);
4353
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 return result;
4355}
4356
Martin v. Löwis18e16552006-02-15 17:27:45 +00004357Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004358 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004359 Py_ssize_t start,
4360 Py_ssize_t end,
4361 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004363 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004364
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004365 str = PyUnicode_FromObject(str);
4366 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004367 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004368 sub = PyUnicode_FromObject(sub);
4369 if (!sub) {
4370 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004371 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372 }
Tim Petersced69f82003-09-16 20:30:58 +00004373
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004374 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004375 result = stringlib_find_slice(
4376 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4377 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4378 start, end
4379 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004380 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004381 result = stringlib_rfind_slice(
4382 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4383 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4384 start, end
4385 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004386
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004387 Py_DECREF(str);
4388 Py_DECREF(sub);
4389
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 return result;
4391}
4392
Tim Petersced69f82003-09-16 20:30:58 +00004393static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394int tailmatch(PyUnicodeObject *self,
4395 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004396 Py_ssize_t start,
4397 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 int direction)
4399{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 if (substring->length == 0)
4401 return 1;
4402
Fredrik Lundhc8162812006-05-26 19:33:03 +00004403 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404
4405 end -= substring->length;
4406 if (end < start)
4407 return 0;
4408
4409 if (direction > 0) {
4410 if (Py_UNICODE_MATCH(self, end, substring))
4411 return 1;
4412 } else {
4413 if (Py_UNICODE_MATCH(self, start, substring))
4414 return 1;
4415 }
4416
4417 return 0;
4418}
4419
Martin v. Löwis18e16552006-02-15 17:27:45 +00004420Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004422 Py_ssize_t start,
4423 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 int direction)
4425{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004427
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 str = PyUnicode_FromObject(str);
4429 if (str == NULL)
4430 return -1;
4431 substr = PyUnicode_FromObject(substr);
4432 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004433 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 return -1;
4435 }
Tim Petersced69f82003-09-16 20:30:58 +00004436
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 result = tailmatch((PyUnicodeObject *)str,
4438 (PyUnicodeObject *)substr,
4439 start, end, direction);
4440 Py_DECREF(str);
4441 Py_DECREF(substr);
4442 return result;
4443}
4444
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445/* Apply fixfct filter to the Unicode object self and return a
4446 reference to the modified object */
4447
Tim Petersced69f82003-09-16 20:30:58 +00004448static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449PyObject *fixup(PyUnicodeObject *self,
4450 int (*fixfct)(PyUnicodeObject *s))
4451{
4452
4453 PyUnicodeObject *u;
4454
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004455 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 if (u == NULL)
4457 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004458
4459 Py_UNICODE_COPY(u->str, self->str, self->length);
4460
Tim Peters7a29bd52001-09-12 03:03:31 +00004461 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 /* fixfct should return TRUE if it modified the buffer. If
4463 FALSE, return a reference to the original buffer instead
4464 (to save space, not time) */
4465 Py_INCREF(self);
4466 Py_DECREF(u);
4467 return (PyObject*) self;
4468 }
4469 return (PyObject*) u;
4470}
4471
Tim Petersced69f82003-09-16 20:30:58 +00004472static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473int fixupper(PyUnicodeObject *self)
4474{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004475 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 Py_UNICODE *s = self->str;
4477 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004478
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 while (len-- > 0) {
4480 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004481
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 ch = Py_UNICODE_TOUPPER(*s);
4483 if (ch != *s) {
4484 status = 1;
4485 *s = ch;
4486 }
4487 s++;
4488 }
4489
4490 return status;
4491}
4492
Tim Petersced69f82003-09-16 20:30:58 +00004493static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494int fixlower(PyUnicodeObject *self)
4495{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004496 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 Py_UNICODE *s = self->str;
4498 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004499
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 while (len-- > 0) {
4501 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004502
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 ch = Py_UNICODE_TOLOWER(*s);
4504 if (ch != *s) {
4505 status = 1;
4506 *s = ch;
4507 }
4508 s++;
4509 }
4510
4511 return status;
4512}
4513
Tim Petersced69f82003-09-16 20:30:58 +00004514static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515int fixswapcase(PyUnicodeObject *self)
4516{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004517 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 Py_UNICODE *s = self->str;
4519 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004520
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 while (len-- > 0) {
4522 if (Py_UNICODE_ISUPPER(*s)) {
4523 *s = Py_UNICODE_TOLOWER(*s);
4524 status = 1;
4525 } else if (Py_UNICODE_ISLOWER(*s)) {
4526 *s = Py_UNICODE_TOUPPER(*s);
4527 status = 1;
4528 }
4529 s++;
4530 }
4531
4532 return status;
4533}
4534
Tim Petersced69f82003-09-16 20:30:58 +00004535static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536int fixcapitalize(PyUnicodeObject *self)
4537{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004538 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004539 Py_UNICODE *s = self->str;
4540 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004541
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004542 if (len == 0)
4543 return 0;
4544 if (Py_UNICODE_ISLOWER(*s)) {
4545 *s = Py_UNICODE_TOUPPER(*s);
4546 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004548 s++;
4549 while (--len > 0) {
4550 if (Py_UNICODE_ISUPPER(*s)) {
4551 *s = Py_UNICODE_TOLOWER(*s);
4552 status = 1;
4553 }
4554 s++;
4555 }
4556 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557}
4558
4559static
4560int fixtitle(PyUnicodeObject *self)
4561{
4562 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4563 register Py_UNICODE *e;
4564 int previous_is_cased;
4565
4566 /* Shortcut for single character strings */
4567 if (PyUnicode_GET_SIZE(self) == 1) {
4568 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4569 if (*p != ch) {
4570 *p = ch;
4571 return 1;
4572 }
4573 else
4574 return 0;
4575 }
Tim Petersced69f82003-09-16 20:30:58 +00004576
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577 e = p + PyUnicode_GET_SIZE(self);
4578 previous_is_cased = 0;
4579 for (; p < e; p++) {
4580 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004581
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 if (previous_is_cased)
4583 *p = Py_UNICODE_TOLOWER(ch);
4584 else
4585 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004586
4587 if (Py_UNICODE_ISLOWER(ch) ||
4588 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589 Py_UNICODE_ISTITLE(ch))
4590 previous_is_cased = 1;
4591 else
4592 previous_is_cased = 0;
4593 }
4594 return 1;
4595}
4596
Tim Peters8ce9f162004-08-27 01:49:32 +00004597PyObject *
4598PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599{
Tim Peters8ce9f162004-08-27 01:49:32 +00004600 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004601 const Py_UNICODE blank = ' ';
4602 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004603 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004604 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004605 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4606 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004607 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4608 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004609 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004610 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004611 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
Tim Peters05eba1f2004-08-27 21:32:02 +00004613 fseq = PySequence_Fast(seq, "");
4614 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004615 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004616 }
4617
Tim Peters91879ab2004-08-27 22:35:44 +00004618 /* Grrrr. A codec may be invoked to convert str objects to
4619 * Unicode, and so it's possible to call back into Python code
4620 * during PyUnicode_FromObject(), and so it's possible for a sick
4621 * codec to change the size of fseq (if seq is a list). Therefore
4622 * we have to keep refetching the size -- can't assume seqlen
4623 * is invariant.
4624 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004625 seqlen = PySequence_Fast_GET_SIZE(fseq);
4626 /* If empty sequence, return u"". */
4627 if (seqlen == 0) {
4628 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4629 goto Done;
4630 }
4631 /* If singleton sequence with an exact Unicode, return that. */
4632 if (seqlen == 1) {
4633 item = PySequence_Fast_GET_ITEM(fseq, 0);
4634 if (PyUnicode_CheckExact(item)) {
4635 Py_INCREF(item);
4636 res = (PyUnicodeObject *)item;
4637 goto Done;
4638 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004639 }
4640
Tim Peters05eba1f2004-08-27 21:32:02 +00004641 /* At least two items to join, or one that isn't exact Unicode. */
4642 if (seqlen > 1) {
4643 /* Set up sep and seplen -- they're needed. */
4644 if (separator == NULL) {
4645 sep = &blank;
4646 seplen = 1;
4647 }
4648 else {
4649 internal_separator = PyUnicode_FromObject(separator);
4650 if (internal_separator == NULL)
4651 goto onError;
4652 sep = PyUnicode_AS_UNICODE(internal_separator);
4653 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004654 /* In case PyUnicode_FromObject() mutated seq. */
4655 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004656 }
4657 }
4658
4659 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004660 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004661 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004662 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004663 res_p = PyUnicode_AS_UNICODE(res);
4664 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004665
Tim Peters05eba1f2004-08-27 21:32:02 +00004666 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004667 Py_ssize_t itemlen;
4668 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004669
4670 item = PySequence_Fast_GET_ITEM(fseq, i);
4671 /* Convert item to Unicode. */
4672 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4673 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004674 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004675 " %.80s found",
4676 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004677 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004678 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004679 item = PyUnicode_FromObject(item);
4680 if (item == NULL)
4681 goto onError;
4682 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004683
Tim Peters91879ab2004-08-27 22:35:44 +00004684 /* In case PyUnicode_FromObject() mutated seq. */
4685 seqlen = PySequence_Fast_GET_SIZE(fseq);
4686
Tim Peters8ce9f162004-08-27 01:49:32 +00004687 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004689 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004690 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004691 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004692 if (i < seqlen - 1) {
4693 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004694 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004695 goto Overflow;
4696 }
4697 if (new_res_used > res_alloc) {
4698 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004699 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004700 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004701 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004702 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004703 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004704 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004705 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004707 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004708 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004710
4711 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004712 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004713 res_p += itemlen;
4714 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004715 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004716 res_p += seplen;
4717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004719 res_used = new_res_used;
4720 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004721
Tim Peters05eba1f2004-08-27 21:32:02 +00004722 /* Shrink res to match the used area; this probably can't fail,
4723 * but it's cheap to check.
4724 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004725 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004726 goto onError;
4727
4728 Done:
4729 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004730 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 return (PyObject *)res;
4732
Tim Peters8ce9f162004-08-27 01:49:32 +00004733 Overflow:
4734 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004735 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004736 Py_DECREF(item);
4737 /* fall through */
4738
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004740 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004741 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004742 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 return NULL;
4744}
4745
Tim Petersced69f82003-09-16 20:30:58 +00004746static
4747PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004748 Py_ssize_t left,
4749 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 Py_UNICODE fill)
4751{
4752 PyUnicodeObject *u;
4753
4754 if (left < 0)
4755 left = 0;
4756 if (right < 0)
4757 right = 0;
4758
Tim Peters7a29bd52001-09-12 03:03:31 +00004759 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 Py_INCREF(self);
4761 return self;
4762 }
4763
4764 u = _PyUnicode_New(left + self->length + right);
4765 if (u) {
4766 if (left)
4767 Py_UNICODE_FILL(u->str, fill, left);
4768 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4769 if (right)
4770 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4771 }
4772
4773 return u;
4774}
4775
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion relies on names supplied by the enclosing function: a
   `PyObject *str` scratch variable, the target `list`, and an `onError`
   label that is jumped to when creating or appending the slice fails.
   The body is wrapped in do { } while (0) so the macro behaves as one
   statement and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4786
4787static
4788PyObject *split_whitespace(PyUnicodeObject *self,
4789 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004790 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004792 register Py_ssize_t i;
4793 register Py_ssize_t j;
4794 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 PyObject *str;
4796
4797 for (i = j = 0; i < len; ) {
4798 /* find a token */
4799 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4800 i++;
4801 j = i;
4802 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4803 i++;
4804 if (j < i) {
4805 if (maxcount-- <= 0)
4806 break;
4807 SPLIT_APPEND(self->str, j, i);
4808 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4809 i++;
4810 j = i;
4811 }
4812 }
4813 if (j < len) {
4814 SPLIT_APPEND(self->str, j, len);
4815 }
4816 return list;
4817
4818 onError:
4819 Py_DECREF(list);
4820 return NULL;
4821}
4822
4823PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004824 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004826 register Py_ssize_t i;
4827 register Py_ssize_t j;
4828 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 PyObject *list;
4830 PyObject *str;
4831 Py_UNICODE *data;
4832
4833 string = PyUnicode_FromObject(string);
4834 if (string == NULL)
4835 return NULL;
4836 data = PyUnicode_AS_UNICODE(string);
4837 len = PyUnicode_GET_SIZE(string);
4838
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 list = PyList_New(0);
4840 if (!list)
4841 goto onError;
4842
4843 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004844 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004847 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849
4850 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004851 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 if (i < len) {
4853 if (data[i] == '\r' && i + 1 < len &&
4854 data[i+1] == '\n')
4855 i += 2;
4856 else
4857 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004858 if (keepends)
4859 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 }
Guido van Rossum86662912000-04-11 15:38:46 +00004861 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 j = i;
4863 }
4864 if (j < len) {
4865 SPLIT_APPEND(data, j, len);
4866 }
4867
4868 Py_DECREF(string);
4869 return list;
4870
4871 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004872 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 Py_DECREF(string);
4874 return NULL;
4875}
4876
Tim Petersced69f82003-09-16 20:30:58 +00004877static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878PyObject *split_char(PyUnicodeObject *self,
4879 PyObject *list,
4880 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004881 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004883 register Py_ssize_t i;
4884 register Py_ssize_t j;
4885 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 PyObject *str;
4887
4888 for (i = j = 0; i < len; ) {
4889 if (self->str[i] == ch) {
4890 if (maxcount-- <= 0)
4891 break;
4892 SPLIT_APPEND(self->str, j, i);
4893 i = j = i + 1;
4894 } else
4895 i++;
4896 }
4897 if (j <= len) {
4898 SPLIT_APPEND(self->str, j, len);
4899 }
4900 return list;
4901
4902 onError:
4903 Py_DECREF(list);
4904 return NULL;
4905}
4906
Tim Petersced69f82003-09-16 20:30:58 +00004907static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908PyObject *split_substring(PyUnicodeObject *self,
4909 PyObject *list,
4910 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004911 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004913 register Py_ssize_t i;
4914 register Py_ssize_t j;
4915 Py_ssize_t len = self->length;
4916 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 PyObject *str;
4918
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004919 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 if (Py_UNICODE_MATCH(self, i, substring)) {
4921 if (maxcount-- <= 0)
4922 break;
4923 SPLIT_APPEND(self->str, j, i);
4924 i = j = i + sublen;
4925 } else
4926 i++;
4927 }
4928 if (j <= len) {
4929 SPLIT_APPEND(self->str, j, len);
4930 }
4931 return list;
4932
4933 onError:
4934 Py_DECREF(list);
4935 return NULL;
4936}
4937
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004938static
4939PyObject *rsplit_whitespace(PyUnicodeObject *self,
4940 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004942{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004943 register Py_ssize_t i;
4944 register Py_ssize_t j;
4945 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004946 PyObject *str;
4947
4948 for (i = j = len - 1; i >= 0; ) {
4949 /* find a token */
4950 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4951 i--;
4952 j = i;
4953 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4954 i--;
4955 if (j > i) {
4956 if (maxcount-- <= 0)
4957 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004958 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004959 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4960 i--;
4961 j = i;
4962 }
4963 }
4964 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004965 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004966 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004967 if (PyList_Reverse(list) < 0)
4968 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004969 return list;
4970
4971 onError:
4972 Py_DECREF(list);
4973 return NULL;
4974}
4975
4976static
4977PyObject *rsplit_char(PyUnicodeObject *self,
4978 PyObject *list,
4979 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004980 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004981{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004982 register Py_ssize_t i;
4983 register Py_ssize_t j;
4984 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004985 PyObject *str;
4986
4987 for (i = j = len - 1; i >= 0; ) {
4988 if (self->str[i] == ch) {
4989 if (maxcount-- <= 0)
4990 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004991 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004992 j = i = i - 1;
4993 } else
4994 i--;
4995 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004996 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004997 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004998 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004999 if (PyList_Reverse(list) < 0)
5000 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005001 return list;
5002
5003 onError:
5004 Py_DECREF(list);
5005 return NULL;
5006}
5007
5008static
5009PyObject *rsplit_substring(PyUnicodeObject *self,
5010 PyObject *list,
5011 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005012 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005014 register Py_ssize_t i;
5015 register Py_ssize_t j;
5016 Py_ssize_t len = self->length;
5017 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005018 PyObject *str;
5019
5020 for (i = len - sublen, j = len; i >= 0; ) {
5021 if (Py_UNICODE_MATCH(self, i, substring)) {
5022 if (maxcount-- <= 0)
5023 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005024 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005025 j = i;
5026 i -= sublen;
5027 } else
5028 i--;
5029 }
5030 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005031 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005032 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005033 if (PyList_Reverse(list) < 0)
5034 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005035 return list;
5036
5037 onError:
5038 Py_DECREF(list);
5039 return NULL;
5040}
5041
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042#undef SPLIT_APPEND
5043
5044static
5045PyObject *split(PyUnicodeObject *self,
5046 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005047 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048{
5049 PyObject *list;
5050
5051 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005052 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053
5054 list = PyList_New(0);
5055 if (!list)
5056 return NULL;
5057
5058 if (substring == NULL)
5059 return split_whitespace(self,list,maxcount);
5060
5061 else if (substring->length == 1)
5062 return split_char(self,list,substring->str[0],maxcount);
5063
5064 else if (substring->length == 0) {
5065 Py_DECREF(list);
5066 PyErr_SetString(PyExc_ValueError, "empty separator");
5067 return NULL;
5068 }
5069 else
5070 return split_substring(self,list,substring,maxcount);
5071}
5072
Tim Petersced69f82003-09-16 20:30:58 +00005073static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005074PyObject *rsplit(PyUnicodeObject *self,
5075 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005076 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005077{
5078 PyObject *list;
5079
5080 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005081 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005082
5083 list = PyList_New(0);
5084 if (!list)
5085 return NULL;
5086
5087 if (substring == NULL)
5088 return rsplit_whitespace(self,list,maxcount);
5089
5090 else if (substring->length == 1)
5091 return rsplit_char(self,list,substring->str[0],maxcount);
5092
5093 else if (substring->length == 0) {
5094 Py_DECREF(list);
5095 PyErr_SetString(PyExc_ValueError, "empty separator");
5096 return NULL;
5097 }
5098 else
5099 return rsplit_substring(self,list,substring,maxcount);
5100}
5101
5102static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103PyObject *replace(PyUnicodeObject *self,
5104 PyUnicodeObject *str1,
5105 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005106 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107{
5108 PyUnicodeObject *u;
5109
5110 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005111 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112
Fredrik Lundh347ee272006-05-24 16:35:18 +00005113 if (str1->length == str2->length) {
5114 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005115 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005116 if (str1->length == 1) {
5117 /* replace characters */
5118 Py_UNICODE u1, u2;
5119 if (!findchar(self->str, self->length, str1->str[0]))
5120 goto nothing;
5121 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5122 if (!u)
5123 return NULL;
5124 Py_UNICODE_COPY(u->str, self->str, self->length);
5125 u1 = str1->str[0];
5126 u2 = str2->str[0];
5127 for (i = 0; i < u->length; i++)
5128 if (u->str[i] == u1) {
5129 if (--maxcount < 0)
5130 break;
5131 u->str[i] = u2;
5132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005134 i = fastsearch(
5135 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005137 if (i < 0)
5138 goto nothing;
5139 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5140 if (!u)
5141 return NULL;
5142 Py_UNICODE_COPY(u->str, self->str, self->length);
5143 while (i <= self->length - str1->length)
5144 if (Py_UNICODE_MATCH(self, i, str1)) {
5145 if (--maxcount < 0)
5146 break;
5147 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5148 i += str1->length;
5149 } else
5150 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005153
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005154 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005155 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 Py_UNICODE *p;
5157
5158 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005159 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 if (n > maxcount)
5161 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005162 if (n == 0)
5163 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005164 /* new_size = self->length + n * (str2->length - str1->length)); */
5165 delta = (str2->length - str1->length);
5166 if (delta == 0) {
5167 new_size = self->length;
5168 } else {
5169 product = n * (str2->length - str1->length);
5170 if ((product / (str2->length - str1->length)) != n) {
5171 PyErr_SetString(PyExc_OverflowError,
5172 "replace string is too long");
5173 return NULL;
5174 }
5175 new_size = self->length + product;
5176 if (new_size < 0) {
5177 PyErr_SetString(PyExc_OverflowError,
5178 "replace string is too long");
5179 return NULL;
5180 }
5181 }
5182 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005183 if (!u)
5184 return NULL;
5185 i = 0;
5186 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005187 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005188 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005189 while (n-- > 0) {
5190 /* look for next match */
5191 j = i;
5192 while (j <= e) {
5193 if (Py_UNICODE_MATCH(self, j, str1))
5194 break;
5195 j++;
5196 }
5197 if (j > i) {
5198 if (j > e)
5199 break;
5200 /* copy unchanged part [i:j] */
5201 Py_UNICODE_COPY(p, self->str+i, j-i);
5202 p += j - i;
5203 }
5204 /* copy substitution string */
5205 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005206 Py_UNICODE_COPY(p, str2->str, str2->length);
5207 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005208 }
5209 i = j + str1->length;
5210 }
5211 if (i < self->length)
5212 /* copy tail [i:] */
5213 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005214 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005215 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005216 while (n > 0) {
5217 Py_UNICODE_COPY(p, str2->str, str2->length);
5218 p += str2->length;
5219 if (--n <= 0)
5220 break;
5221 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005223 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 }
5225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005227
5228nothing:
5229 /* nothing to replace; return original string (when possible) */
5230 if (PyUnicode_CheckExact(self)) {
5231 Py_INCREF(self);
5232 return (PyObject *) self;
5233 }
5234 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235}
5236
5237/* --- Unicode Object Methods --------------------------------------------- */
5238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005239PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240"S.title() -> unicode\n\
5241\n\
5242Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005243characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244
5245static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005246unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 return fixup(self, fixtitle);
5249}
5250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005251PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252"S.capitalize() -> unicode\n\
5253\n\
5254Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005255have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256
5257static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005258unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 return fixup(self, fixcapitalize);
5261}
5262
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize each word in place in the list, then join the words with
   the default separator.  Returns NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;      /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5300
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005301/* Argument converter. Coerces to a single unicode character */
5302
5303static int
5304convert_uc(PyObject *obj, void *addr)
5305{
5306 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5307 PyObject *uniobj;
5308 Py_UNICODE *unistr;
5309
5310 uniobj = PyUnicode_FromObject(obj);
5311 if (uniobj == NULL) {
5312 PyErr_SetString(PyExc_TypeError,
5313 "The fill character cannot be converted to Unicode");
5314 return 0;
5315 }
5316 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5317 PyErr_SetString(PyExc_TypeError,
5318 "The fill character must be exactly one character long");
5319 Py_DECREF(uniobj);
5320 return 0;
5321 }
5322 unistr = PyUnicode_AS_UNICODE(uniobj);
5323 *fillcharloc = unistr[0];
5324 Py_DECREF(uniobj);
5325 return 1;
5326}
5327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005328PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005329"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005331Return S centered in a Unicode string of length width. Padding is\n\
5332done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333
5334static PyObject *
5335unicode_center(PyUnicodeObject *self, PyObject *args)
5336{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005337 Py_ssize_t marg, left;
5338 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005339 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340
Thomas Woutersde017742006-02-16 19:34:37 +00005341 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 return NULL;
5343
Tim Peters7a29bd52001-09-12 03:03:31 +00005344 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 Py_INCREF(self);
5346 return (PyObject*) self;
5347 }
5348
5349 marg = width - self->length;
5350 left = marg / 2 + (marg & width & 1);
5351
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005352 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353}
5354
Marc-André Lemburge5034372000-08-08 08:04:29 +00005355#if 0
5356
5357/* This code should go into some future Unicode collation support
5358 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005359 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005360
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005361/* speedy UTF-16 code point order comparison */
5362/* gleaned from: */
5363/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5364
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005365static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005366{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005367 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005368 0, 0, 0, 0, 0, 0, 0, 0,
5369 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005370 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005371};
5372
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373static int
5374unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5375{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005376 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005377
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 Py_UNICODE *s1 = str1->str;
5379 Py_UNICODE *s2 = str2->str;
5380
5381 len1 = str1->length;
5382 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005383
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005385 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005386
5387 c1 = *s1++;
5388 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005389
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005390 if (c1 > (1<<11) * 26)
5391 c1 += utf16Fixup[c1>>11];
5392 if (c2 > (1<<11) * 26)
5393 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005394 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005395
5396 if (c1 != c2)
5397 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005398
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005399 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 }
5401
5402 return (len1 < len2) ? -1 : (len1 != len2);
5403}
5404
Marc-André Lemburge5034372000-08-08 08:04:29 +00005405#else
5406
5407static int
5408unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005410 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005411
5412 Py_UNICODE *s1 = str1->str;
5413 Py_UNICODE *s2 = str2->str;
5414
5415 len1 = str1->length;
5416 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005417
Marc-André Lemburge5034372000-08-08 08:04:29 +00005418 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005419 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005420
Fredrik Lundh45714e92001-06-26 16:39:36 +00005421 c1 = *s1++;
5422 c2 = *s2++;
5423
5424 if (c1 != c2)
5425 return (c1 < c2) ? -1 : 1;
5426
Marc-André Lemburge5034372000-08-08 08:04:29 +00005427 len1--; len2--;
5428 }
5429
5430 return (len1 < len2) ? -1 : (len1 != len2);
5431}
5432
5433#endif
5434
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435int PyUnicode_Compare(PyObject *left,
5436 PyObject *right)
5437{
5438 PyUnicodeObject *u = NULL, *v = NULL;
5439 int result;
5440
5441 /* Coerce the two arguments */
5442 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5443 if (u == NULL)
5444 goto onError;
5445 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5446 if (v == NULL)
5447 goto onError;
5448
Thomas Wouters7e474022000-07-16 12:04:32 +00005449 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 if (v == u) {
5451 Py_DECREF(u);
5452 Py_DECREF(v);
5453 return 0;
5454 }
5455
5456 result = unicode_compare(u, v);
5457
5458 Py_DECREF(u);
5459 Py_DECREF(v);
5460 return result;
5461
5462onError:
5463 Py_XDECREF(u);
5464 Py_XDECREF(v);
5465 return -1;
5466}
5467
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005468PyObject *PyUnicode_RichCompare(PyObject *left,
5469 PyObject *right,
5470 int op)
5471{
5472 int result;
5473
5474 result = PyUnicode_Compare(left, right);
5475 if (result == -1 && PyErr_Occurred())
5476 goto onError;
5477
5478 /* Convert the return value to a Boolean */
5479 switch (op) {
5480 case Py_EQ:
5481 result = (result == 0);
5482 break;
5483 case Py_NE:
5484 result = (result != 0);
5485 break;
5486 case Py_LE:
5487 result = (result <= 0);
5488 break;
5489 case Py_GE:
5490 result = (result >= 0);
5491 break;
5492 case Py_LT:
5493 result = (result == -1);
5494 break;
5495 case Py_GT:
5496 result = (result == 1);
5497 break;
5498 }
5499 return PyBool_FromLong(result);
5500
5501 onError:
5502
5503 /* Standard case
5504
5505 Type errors mean that PyUnicode_FromObject() could not convert
5506 one of the arguments (usually the right hand side) to Unicode,
5507 ie. we can't handle the comparison request. However, it is
5508 possible that the other object knows a comparison method, which
5509 is why we return Py_NotImplemented to give the other object a
5510 chance.
5511
5512 */
5513 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5514 PyErr_Clear();
5515 Py_INCREF(Py_NotImplemented);
5516 return Py_NotImplemented;
5517 }
5518 if (op != Py_EQ && op != Py_NE)
5519 return NULL;
5520
5521 /* Equality comparison.
5522
5523 This is a special case: we silence any PyExc_UnicodeDecodeError
5524 and instead turn it into a PyErr_UnicodeWarning.
5525
5526 */
5527 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5528 return NULL;
5529 PyErr_Clear();
5530 if (PyErr_Warn(PyExc_UnicodeWarning,
5531 (op == Py_EQ) ?
5532 "Unicode equal comparison "
5533 "failed to convert both arguments to Unicode - "
5534 "interpreting them as being unequal" :
5535 "Unicode unequal comparison "
5536 "failed to convert both arguments to Unicode - "
5537 "interpreting them as being unequal"
5538 ) < 0)
5539 return NULL;
5540 result = (op == Py_NE);
5541 return PyBool_FromLong(result);
5542}
5543
Guido van Rossum403d68b2000-03-13 15:55:09 +00005544int PyUnicode_Contains(PyObject *container,
5545 PyObject *element)
5546{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005547 PyObject *str, *sub;
5548 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005549
5550 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005551 sub = PyUnicode_FromObject(element);
5552 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005553 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005554 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005555 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005556 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005557
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005558 str = PyUnicode_FromObject(container);
5559 if (!str) {
5560 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005561 return -1;
5562 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005563
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005564 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005565
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005566 Py_DECREF(str);
5567 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005568
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005569 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005570}
5571
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572/* Concat to string or Unicode object giving a new Unicode object. */
5573
5574PyObject *PyUnicode_Concat(PyObject *left,
5575 PyObject *right)
5576{
5577 PyUnicodeObject *u = NULL, *v = NULL, *w;
5578
5579 /* Coerce the two arguments */
5580 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5581 if (u == NULL)
5582 goto onError;
5583 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5584 if (v == NULL)
5585 goto onError;
5586
5587 /* Shortcuts */
5588 if (v == unicode_empty) {
5589 Py_DECREF(v);
5590 return (PyObject *)u;
5591 }
5592 if (u == unicode_empty) {
5593 Py_DECREF(u);
5594 return (PyObject *)v;
5595 }
5596
5597 /* Concat the two Unicode strings */
5598 w = _PyUnicode_New(u->length + v->length);
5599 if (w == NULL)
5600 goto onError;
5601 Py_UNICODE_COPY(w->str, u->str, u->length);
5602 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5603
5604 Py_DECREF(u);
5605 Py_DECREF(v);
5606 return (PyObject *)w;
5607
5608onError:
5609 Py_XDECREF(u);
5610 Py_XDECREF(v);
5611 return NULL;
5612}
5613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005614PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615"S.count(sub[, start[, end]]) -> int\n\
5616\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005617Return the number of non-overlapping occurrences of substring sub in\n\
5618Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005619interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
5621static PyObject *
5622unicode_count(PyUnicodeObject *self, PyObject *args)
5623{
5624 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005625 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005626 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 PyObject *result;
5628
Guido van Rossumb8872e62000-05-09 14:14:27 +00005629 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5630 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 return NULL;
5632
5633 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005634 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 if (substring == NULL)
5636 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005637
Fredrik Lundhc8162812006-05-26 19:33:03 +00005638 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005640 result = PyInt_FromSsize_t(
5641 stringlib_count(self->str + start, end - start,
5642 substring->str, substring->length)
5643 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
5645 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005646
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 return result;
5648}
5649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005650PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005651"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005653Encodes S using the codec registered for encoding. encoding defaults\n\
5654to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005655handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5657'xmlcharrefreplace' as well as any other name registered with\n\
5658codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
5660static PyObject *
5661unicode_encode(PyUnicodeObject *self, PyObject *args)
5662{
5663 char *encoding = NULL;
5664 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005665 PyObject *v;
5666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5668 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005669 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005670 if (v == NULL)
5671 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005672 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5673 PyErr_Format(PyExc_TypeError,
5674 "encoder did not return a string/unicode object "
5675 "(type=%.400s)",
5676 v->ob_type->tp_name);
5677 Py_DECREF(v);
5678 return NULL;
5679 }
5680 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005681
5682 onError:
5683 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005684}
5685
5686PyDoc_STRVAR(decode__doc__,
5687"S.decode([encoding[,errors]]) -> string or unicode\n\
5688\n\
5689Decodes S using the codec registered for encoding. encoding defaults\n\
5690to the default encoding. errors may be given to set a different error\n\
5691handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5692a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5693as well as any other name registerd with codecs.register_error that is\n\
5694able to handle UnicodeDecodeErrors.");
5695
5696static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005697unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005698{
5699 char *encoding = NULL;
5700 char *errors = NULL;
5701 PyObject *v;
5702
5703 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5704 return NULL;
5705 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005706 if (v == NULL)
5707 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005708 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5709 PyErr_Format(PyExc_TypeError,
5710 "decoder did not return a string/unicode object "
5711 "(type=%.400s)",
5712 v->ob_type->tp_name);
5713 Py_DECREF(v);
5714 return NULL;
5715 }
5716 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005717
5718 onError:
5719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720}
5721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005722PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723"S.expandtabs([tabsize]) -> unicode\n\
5724\n\
5725Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005726If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
5728static PyObject*
5729unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5730{
5731 Py_UNICODE *e;
5732 Py_UNICODE *p;
5733 Py_UNICODE *q;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005734 Py_UNICODE *qe;
5735 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 PyUnicodeObject *u;
5737 int tabsize = 8;
5738
5739 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5740 return NULL;
5741
Thomas Wouters7e474022000-07-16 12:04:32 +00005742 /* First pass: determine size of output string */
Guido van Rossum44a93e52008-03-11 21:14:54 +00005743 i = 0; /* chars up to and including most recent \n or \r */
5744 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
5745 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 for (p = self->str; p < e; p++)
5747 if (*p == '\t') {
Neal Norwitz66e64e22007-06-09 04:06:30 +00005748 if (tabsize > 0) {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005749 incr = tabsize - (j % tabsize); /* cannot overflow */
5750 if (j > PY_SSIZE_T_MAX - incr)
5751 goto overflow1;
5752 j += incr;
5753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 }
5755 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005756 if (j > PY_SSIZE_T_MAX - 1)
5757 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 j++;
5759 if (*p == '\n' || *p == '\r') {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005760 if (i > PY_SSIZE_T_MAX - j)
5761 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 i += j;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005763 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 }
5765 }
5766
Guido van Rossum44a93e52008-03-11 21:14:54 +00005767 if (i > PY_SSIZE_T_MAX - j)
5768 goto overflow1;
Neal Norwitz66e64e22007-06-09 04:06:30 +00005769
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 /* Second pass: create output string and fill it */
5771 u = _PyUnicode_New(i + j);
5772 if (!u)
5773 return NULL;
5774
Guido van Rossum44a93e52008-03-11 21:14:54 +00005775 j = 0; /* same as in first pass */
5776 q = u->str; /* next output char */
5777 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778
5779 for (p = self->str; p < e; p++)
5780 if (*p == '\t') {
5781 if (tabsize > 0) {
5782 i = tabsize - (j % tabsize);
5783 j += i;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005784 while (i--) {
5785 if (q >= qe)
5786 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 *q++ = ' ';
Guido van Rossum44a93e52008-03-11 21:14:54 +00005788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 }
5790 }
5791 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005792 if (q >= qe)
5793 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 *q++ = *p;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005795 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 if (*p == '\n' || *p == '\r')
5797 j = 0;
5798 }
5799
5800 return (PyObject*) u;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005801
5802 overflow2:
5803 Py_DECREF(u);
5804 overflow1:
5805 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807}
5808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005809PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810"S.find(sub [,start [,end]]) -> int\n\
5811\n\
5812Return the lowest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00005813such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814arguments start and end are interpreted as in slice notation.\n\
5815\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005816Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
5818static PyObject *
5819unicode_find(PyUnicodeObject *self, PyObject *args)
5820{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005821 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005822 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005823 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005824 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
Guido van Rossumb8872e62000-05-09 14:14:27 +00005826 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5827 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005829 substring = PyUnicode_FromObject(substring);
5830 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 return NULL;
5832
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005833 result = stringlib_find_slice(
5834 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5835 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5836 start, end
5837 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
5839 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005840
5841 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842}
5843
5844static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005845unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846{
5847 if (index < 0 || index >= self->length) {
5848 PyErr_SetString(PyExc_IndexError, "string index out of range");
5849 return NULL;
5850 }
5851
5852 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5853}
5854
5855static long
5856unicode_hash(PyUnicodeObject *self)
5857{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005858 /* Since Unicode objects compare equal to their ASCII string
5859 counterparts, they should use the individual character values
5860 as basis for their hash value. This is needed to assure that
5861 strings and Unicode objects behave in the same way as
5862 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005865 register Py_UNICODE *p;
5866 register long x;
5867
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 if (self->hash != -1)
5869 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005870 len = PyUnicode_GET_SIZE(self);
5871 p = PyUnicode_AS_UNICODE(self);
5872 x = *p << 7;
5873 while (--len >= 0)
5874 x = (1000003*x) ^ *p++;
5875 x ^= PyUnicode_GET_SIZE(self);
5876 if (x == -1)
5877 x = -2;
5878 self->hash = x;
5879 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880}
5881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005882PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883"S.index(sub [,start [,end]]) -> int\n\
5884\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005885Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
5887static PyObject *
5888unicode_index(PyUnicodeObject *self, PyObject *args)
5889{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005890 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005891 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005892 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005893 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
Guido van Rossumb8872e62000-05-09 14:14:27 +00005895 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5896 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005898 substring = PyUnicode_FromObject(substring);
5899 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 return NULL;
5901
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005902 result = stringlib_find_slice(
5903 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5904 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5905 start, end
5906 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907
5908 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005909
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 if (result < 0) {
5911 PyErr_SetString(PyExc_ValueError, "substring not found");
5912 return NULL;
5913 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005914
Martin v. Löwis18e16552006-02-15 17:27:45 +00005915 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916}
5917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005918PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005919"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005921Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005922at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923
5924static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005925unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926{
5927 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5928 register const Py_UNICODE *e;
5929 int cased;
5930
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 /* Shortcut for single character strings */
5932 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005933 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005935 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005936 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005937 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005938
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 e = p + PyUnicode_GET_SIZE(self);
5940 cased = 0;
5941 for (; p < e; p++) {
5942 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005945 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 else if (!cased && Py_UNICODE_ISLOWER(ch))
5947 cased = 1;
5948 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005949 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950}
5951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005952PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005953"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005955Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005956at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
5958static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005959unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960{
5961 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5962 register const Py_UNICODE *e;
5963 int cased;
5964
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 /* Shortcut for single character strings */
5966 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005967 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005969 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005970 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005971 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 e = p + PyUnicode_GET_SIZE(self);
5974 cased = 0;
5975 for (; p < e; p++) {
5976 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005979 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 else if (!cased && Py_UNICODE_ISUPPER(ch))
5981 cased = 1;
5982 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005983 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984}
5985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005986PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005987"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005989Return True if S is a titlecased string and there is at least one\n\
5990character in S, i.e. upper- and titlecase characters may only\n\
5991follow uncased characters and lowercase characters only cased ones.\n\
5992Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
5994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005995unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996{
5997 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5998 register const Py_UNICODE *e;
5999 int cased, previous_is_cased;
6000
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 /* Shortcut for single character strings */
6002 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006003 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6004 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006006 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006007 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006008 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006009
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 e = p + PyUnicode_GET_SIZE(self);
6011 cased = 0;
6012 previous_is_cased = 0;
6013 for (; p < e; p++) {
6014 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006015
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6017 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006018 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 previous_is_cased = 1;
6020 cased = 1;
6021 }
6022 else if (Py_UNICODE_ISLOWER(ch)) {
6023 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006024 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 previous_is_cased = 1;
6026 cased = 1;
6027 }
6028 else
6029 previous_is_cased = 0;
6030 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006031 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032}
6033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006034PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006035"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006037Return True if all characters in S are whitespace\n\
6038and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039
6040static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006041unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042{
6043 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6044 register const Py_UNICODE *e;
6045
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 /* Shortcut for single character strings */
6047 if (PyUnicode_GET_SIZE(self) == 1 &&
6048 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006049 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006051 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006052 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006053 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006054
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 e = p + PyUnicode_GET_SIZE(self);
6056 for (; p < e; p++) {
6057 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006058 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006060 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061}
6062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006063PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006064"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006065\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006066Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006067and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006068
6069static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006070unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006071{
6072 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6073 register const Py_UNICODE *e;
6074
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006075 /* Shortcut for single character strings */
6076 if (PyUnicode_GET_SIZE(self) == 1 &&
6077 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006078 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006079
6080 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006081 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006082 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006083
6084 e = p + PyUnicode_GET_SIZE(self);
6085 for (; p < e; p++) {
6086 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006087 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006088 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006089 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006090}
6091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006092PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006093"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006094\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006095Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006096and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006097
6098static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006099unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006100{
6101 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6102 register const Py_UNICODE *e;
6103
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006104 /* Shortcut for single character strings */
6105 if (PyUnicode_GET_SIZE(self) == 1 &&
6106 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006107 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006108
6109 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006110 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006111 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006112
6113 e = p + PyUnicode_GET_SIZE(self);
6114 for (; p < e; p++) {
6115 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006116 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006117 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006118 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006119}
6120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006121PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006122"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006124Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006125False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126
6127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006128unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129{
6130 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6131 register const Py_UNICODE *e;
6132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 /* Shortcut for single character strings */
6134 if (PyUnicode_GET_SIZE(self) == 1 &&
6135 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006136 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006138 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006139 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006140 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006141
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 e = p + PyUnicode_GET_SIZE(self);
6143 for (; p < e; p++) {
6144 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006145 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006147 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148}
6149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006150PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006151"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006153Return True if all characters in S are digits\n\
6154and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
6156static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006157unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158{
6159 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6160 register const Py_UNICODE *e;
6161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 /* Shortcut for single character strings */
6163 if (PyUnicode_GET_SIZE(self) == 1 &&
6164 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006165 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006167 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006168 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006169 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006170
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 e = p + PyUnicode_GET_SIZE(self);
6172 for (; p < e; p++) {
6173 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006174 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006176 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177}
6178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006179PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006180"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006182Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006183False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
6185static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006186unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187{
6188 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6189 register const Py_UNICODE *e;
6190
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 /* Shortcut for single character strings */
6192 if (PyUnicode_GET_SIZE(self) == 1 &&
6193 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006194 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006196 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006197 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006198 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006199
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 e = p + PyUnicode_GET_SIZE(self);
6201 for (; p < e; p++) {
6202 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006203 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006205 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206}
6207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006208PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209"S.join(sequence) -> unicode\n\
6210\n\
6211Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006212sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
6214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006215unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006217 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
Martin v. Löwis18e16552006-02-15 17:27:45 +00006220static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221unicode_length(PyUnicodeObject *self)
6222{
6223 return self->length;
6224}
6225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006226PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006227"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228\n\
6229Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006230done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231
6232static PyObject *
6233unicode_ljust(PyUnicodeObject *self, PyObject *args)
6234{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006235 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006236 Py_UNICODE fillchar = ' ';
6237
Martin v. Löwis412fb672006-04-13 06:34:32 +00006238 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 return NULL;
6240
Tim Peters7a29bd52001-09-12 03:03:31 +00006241 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 Py_INCREF(self);
6243 return (PyObject*) self;
6244 }
6245
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006246 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247}
6248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006249PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250"S.lower() -> unicode\n\
6251\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006252Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253
6254static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006255unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 return fixup(self, fixlower);
6258}
6259
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6268
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006269/* externally visible for str.strip(unicode) */
6270PyObject *
6271_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6272{
6273 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006274 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006275 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006276 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6277 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006278
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006279 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6280
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006281 i = 0;
6282 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006283 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6284 i++;
6285 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006286 }
6287
6288 j = len;
6289 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006290 do {
6291 j--;
6292 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6293 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006294 }
6295
6296 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006297 Py_INCREF(self);
6298 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006299 }
6300 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006301 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006302}
6303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304
6305static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006306do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006308 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006309 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006310
6311 i = 0;
6312 if (striptype != RIGHTSTRIP) {
6313 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6314 i++;
6315 }
6316 }
6317
6318 j = len;
6319 if (striptype != LEFTSTRIP) {
6320 do {
6321 j--;
6322 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6323 j++;
6324 }
6325
6326 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6327 Py_INCREF(self);
6328 return (PyObject*)self;
6329 }
6330 else
6331 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332}
6333
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006334
6335static PyObject *
6336do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6337{
6338 PyObject *sep = NULL;
6339
6340 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6341 return NULL;
6342
6343 if (sep != NULL && sep != Py_None) {
6344 if (PyUnicode_Check(sep))
6345 return _PyUnicode_XStrip(self, striptype, sep);
6346 else if (PyString_Check(sep)) {
6347 PyObject *res;
6348 sep = PyUnicode_FromObject(sep);
6349 if (sep==NULL)
6350 return NULL;
6351 res = _PyUnicode_XStrip(self, striptype, sep);
6352 Py_DECREF(sep);
6353 return res;
6354 }
6355 else {
6356 PyErr_Format(PyExc_TypeError,
6357 "%s arg must be None, unicode or str",
6358 STRIPNAME(striptype));
6359 return NULL;
6360 }
6361 }
6362
6363 return do_strip(self, striptype);
6364}
6365
6366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006367PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006368"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006369\n\
6370Return a copy of the string S with leading and trailing\n\
6371whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006372If chars is given and not None, remove characters in chars instead.\n\
6373If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006374
6375static PyObject *
6376unicode_strip(PyUnicodeObject *self, PyObject *args)
6377{
6378 if (PyTuple_GET_SIZE(args) == 0)
6379 return do_strip(self, BOTHSTRIP); /* Common case */
6380 else
6381 return do_argstrip(self, BOTHSTRIP, args);
6382}
6383
6384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006385PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006386"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006387\n\
6388Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006389If chars is given and not None, remove characters in chars instead.\n\
6390If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006391
6392static PyObject *
6393unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6394{
6395 if (PyTuple_GET_SIZE(args) == 0)
6396 return do_strip(self, LEFTSTRIP); /* Common case */
6397 else
6398 return do_argstrip(self, LEFTSTRIP, args);
6399}
6400
6401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006402PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006403"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006404\n\
6405Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006406If chars is given and not None, remove characters in chars instead.\n\
6407If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006408
6409static PyObject *
6410unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6411{
6412 if (PyTuple_GET_SIZE(args) == 0)
6413 return do_strip(self, RIGHTSTRIP); /* Common case */
6414 else
6415 return do_argstrip(self, RIGHTSTRIP, args);
6416}
6417
6418
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006420unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421{
6422 PyUnicodeObject *u;
6423 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006424 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006425 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
6427 if (len < 0)
6428 len = 0;
6429
Tim Peters7a29bd52001-09-12 03:03:31 +00006430 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 /* no repeat, return original string */
6432 Py_INCREF(str);
6433 return (PyObject*) str;
6434 }
Tim Peters8f422462000-09-09 06:13:41 +00006435
6436 /* ensure # of chars needed doesn't overflow int and # of bytes
6437 * needed doesn't overflow size_t
6438 */
6439 nchars = len * str->length;
6440 if (len && nchars / len != str->length) {
6441 PyErr_SetString(PyExc_OverflowError,
6442 "repeated string is too long");
6443 return NULL;
6444 }
6445 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6446 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6447 PyErr_SetString(PyExc_OverflowError,
6448 "repeated string is too long");
6449 return NULL;
6450 }
6451 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 if (!u)
6453 return NULL;
6454
6455 p = u->str;
6456
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006457 if (str->length == 1 && len > 0) {
6458 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006459 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006460 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006461 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006462 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006463 done = str->length;
6464 }
6465 while (done < nchars) {
6466 int n = (done <= nchars-done) ? done : nchars-done;
6467 Py_UNICODE_COPY(p+done, p, n);
6468 done += n;
6469 }
6470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472 return (PyObject*) u;
6473}
6474
6475PyObject *PyUnicode_Replace(PyObject *obj,
6476 PyObject *subobj,
6477 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006478 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479{
6480 PyObject *self;
6481 PyObject *str1;
6482 PyObject *str2;
6483 PyObject *result;
6484
6485 self = PyUnicode_FromObject(obj);
6486 if (self == NULL)
6487 return NULL;
6488 str1 = PyUnicode_FromObject(subobj);
6489 if (str1 == NULL) {
6490 Py_DECREF(self);
6491 return NULL;
6492 }
6493 str2 = PyUnicode_FromObject(replobj);
6494 if (str2 == NULL) {
6495 Py_DECREF(self);
6496 Py_DECREF(str1);
6497 return NULL;
6498 }
Tim Petersced69f82003-09-16 20:30:58 +00006499 result = replace((PyUnicodeObject *)self,
6500 (PyUnicodeObject *)str1,
6501 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 maxcount);
6503 Py_DECREF(self);
6504 Py_DECREF(str1);
6505 Py_DECREF(str2);
6506 return result;
6507}
6508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006509PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510"S.replace (old, new[, maxsplit]) -> unicode\n\
6511\n\
6512Return a copy of S with all occurrences of substring\n\
6513old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006514given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515
6516static PyObject*
6517unicode_replace(PyUnicodeObject *self, PyObject *args)
6518{
6519 PyUnicodeObject *str1;
6520 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006521 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 PyObject *result;
6523
Martin v. Löwis18e16552006-02-15 17:27:45 +00006524 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 return NULL;
6526 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6527 if (str1 == NULL)
6528 return NULL;
6529 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006530 if (str2 == NULL) {
6531 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534
6535 result = replace(self, str1, str2, maxcount);
6536
6537 Py_DECREF(str1);
6538 Py_DECREF(str2);
6539 return result;
6540}
6541
6542static
6543PyObject *unicode_repr(PyObject *unicode)
6544{
6545 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6546 PyUnicode_GET_SIZE(unicode),
6547 1);
6548}
6549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006550PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551"S.rfind(sub [,start [,end]]) -> int\n\
6552\n\
6553Return the highest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00006554such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555arguments start and end are interpreted as in slice notation.\n\
6556\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006557Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558
6559static PyObject *
6560unicode_rfind(PyUnicodeObject *self, PyObject *args)
6561{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006562 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006563 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006564 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006565 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566
Guido van Rossumb8872e62000-05-09 14:14:27 +00006567 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6568 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006570 substring = PyUnicode_FromObject(substring);
6571 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 return NULL;
6573
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006574 result = stringlib_rfind_slice(
6575 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6576 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6577 start, end
6578 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
6580 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006581
6582 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583}
6584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586"S.rindex(sub [,start [,end]]) -> int\n\
6587\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006588Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
6590static PyObject *
6591unicode_rindex(PyUnicodeObject *self, PyObject *args)
6592{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006593 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006594 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006595 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006596 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
Guido van Rossumb8872e62000-05-09 14:14:27 +00006598 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6599 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006601 substring = PyUnicode_FromObject(substring);
6602 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 return NULL;
6604
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006605 result = stringlib_rfind_slice(
6606 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6607 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6608 start, end
6609 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
6611 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006612
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 if (result < 0) {
6614 PyErr_SetString(PyExc_ValueError, "substring not found");
6615 return NULL;
6616 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006617 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618}
6619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006620PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006621"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622\n\
6623Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006624done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625
6626static PyObject *
6627unicode_rjust(PyUnicodeObject *self, PyObject *args)
6628{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006629 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006630 Py_UNICODE fillchar = ' ';
6631
Martin v. Löwis412fb672006-04-13 06:34:32 +00006632 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return NULL;
6634
Tim Peters7a29bd52001-09-12 03:03:31 +00006635 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 Py_INCREF(self);
6637 return (PyObject*) self;
6638 }
6639
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006640 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641}
6642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006644unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
6646 /* standard clamping */
6647 if (start < 0)
6648 start = 0;
6649 if (end < 0)
6650 end = 0;
6651 if (end > self->length)
6652 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006653 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 /* full slice, return original string */
6655 Py_INCREF(self);
6656 return (PyObject*) self;
6657 }
6658 if (start > end)
6659 start = end;
6660 /* copy slice */
6661 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6662 end - start);
6663}
6664
6665PyObject *PyUnicode_Split(PyObject *s,
6666 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006667 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668{
6669 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 s = PyUnicode_FromObject(s);
6672 if (s == NULL)
6673 return NULL;
6674 if (sep != NULL) {
6675 sep = PyUnicode_FromObject(sep);
6676 if (sep == NULL) {
6677 Py_DECREF(s);
6678 return NULL;
6679 }
6680 }
6681
6682 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6683
6684 Py_DECREF(s);
6685 Py_XDECREF(sep);
6686 return result;
6687}
6688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006689PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690"S.split([sep [,maxsplit]]) -> list of strings\n\
6691\n\
6692Return a list of the words in S, using sep as the\n\
6693delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006694splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006695any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696
6697static PyObject*
6698unicode_split(PyUnicodeObject *self, PyObject *args)
6699{
6700 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006701 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
Martin v. Löwis18e16552006-02-15 17:27:45 +00006703 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 return NULL;
6705
6706 if (substring == Py_None)
6707 return split(self, NULL, maxcount);
6708 else if (PyUnicode_Check(substring))
6709 return split(self, (PyUnicodeObject *)substring, maxcount);
6710 else
6711 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6712}
6713
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006714PyObject *
6715PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6716{
6717 PyObject* str_obj;
6718 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006719 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006720
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006721 str_obj = PyUnicode_FromObject(str_in);
6722 if (!str_obj)
6723 return NULL;
6724 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006725 if (!sep_obj) {
6726 Py_DECREF(str_obj);
6727 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006728 }
6729
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006730 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006731 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6732 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6733 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006734
Fredrik Lundhb9479482006-05-26 17:22:38 +00006735 Py_DECREF(sep_obj);
6736 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006737
6738 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006739}
6740
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006741
6742PyObject *
6743PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6744{
6745 PyObject* str_obj;
6746 PyObject* sep_obj;
6747 PyObject* out;
6748
6749 str_obj = PyUnicode_FromObject(str_in);
6750 if (!str_obj)
6751 return NULL;
6752 sep_obj = PyUnicode_FromObject(sep_in);
6753 if (!sep_obj) {
6754 Py_DECREF(str_obj);
6755 return NULL;
6756 }
6757
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006758 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006759 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6760 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6761 );
6762
6763 Py_DECREF(sep_obj);
6764 Py_DECREF(str_obj);
6765
6766 return out;
6767}
6768
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006769PyDoc_STRVAR(partition__doc__,
6770"S.partition(sep) -> (head, sep, tail)\n\
6771\n\
6772Searches for the separator sep in S, and returns the part before it,\n\
6773the separator itself, and the part after it. If the separator is not\n\
6774found, returns S and two empty strings.");
6775
6776static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006777unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006778{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006779 return PyUnicode_Partition((PyObject *)self, separator);
6780}
6781
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006782PyDoc_STRVAR(rpartition__doc__,
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006783"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006784\n\
6785Searches for the separator sep in S, starting at the end of S, and returns\n\
6786the part before it, the separator itself, and the part after it. If the\n\
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006787separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006788
6789static PyObject*
6790unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6791{
6792 return PyUnicode_RPartition((PyObject *)self, separator);
6793}
6794
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006795PyObject *PyUnicode_RSplit(PyObject *s,
6796 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006797 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006798{
6799 PyObject *result;
6800
6801 s = PyUnicode_FromObject(s);
6802 if (s == NULL)
6803 return NULL;
6804 if (sep != NULL) {
6805 sep = PyUnicode_FromObject(sep);
6806 if (sep == NULL) {
6807 Py_DECREF(s);
6808 return NULL;
6809 }
6810 }
6811
6812 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6813
6814 Py_DECREF(s);
6815 Py_XDECREF(sep);
6816 return result;
6817}
6818
6819PyDoc_STRVAR(rsplit__doc__,
6820"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6821\n\
6822Return a list of the words in S, using sep as the\n\
6823delimiter string, starting at the end of the string and\n\
6824working to the front. If maxsplit is given, at most maxsplit\n\
6825splits are done. If sep is not specified, any whitespace string\n\
6826is a separator.");
6827
6828static PyObject*
6829unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6830{
6831 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006832 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006833
Martin v. Löwis18e16552006-02-15 17:27:45 +00006834 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006835 return NULL;
6836
6837 if (substring == Py_None)
6838 return rsplit(self, NULL, maxcount);
6839 else if (PyUnicode_Check(substring))
6840 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6841 else
6842 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6843}
6844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006845PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006846"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847\n\
6848Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006849Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006850is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
6852static PyObject*
6853unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6854{
Guido van Rossum86662912000-04-11 15:38:46 +00006855 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856
Guido van Rossum86662912000-04-11 15:38:46 +00006857 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 return NULL;
6859
Guido van Rossum86662912000-04-11 15:38:46 +00006860 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861}
6862
6863static
6864PyObject *unicode_str(PyUnicodeObject *self)
6865{
Fred Drakee4315f52000-05-09 19:53:39 +00006866 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867}
6868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006869PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870"S.swapcase() -> unicode\n\
6871\n\
6872Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006873and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874
6875static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006876unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 return fixup(self, fixswapcase);
6879}
6880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006881PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882"S.translate(table) -> unicode\n\
6883\n\
6884Return a copy of the string S, where all characters have been mapped\n\
6885through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006886Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6887Unmapped characters are left untouched. Characters mapped to None\n\
6888are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889
6890static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006891unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
Tim Petersced69f82003-09-16 20:30:58 +00006893 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006895 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 "ignore");
6897}
6898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006899PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900"S.upper() -> unicode\n\
6901\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006902Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903
6904static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006905unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 return fixup(self, fixupper);
6908}
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911"S.zfill(width) -> unicode\n\
6912\n\
6913Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006914of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
6916static PyObject *
6917unicode_zfill(PyUnicodeObject *self, PyObject *args)
6918{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006919 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 PyUnicodeObject *u;
6921
Martin v. Löwis18e16552006-02-15 17:27:45 +00006922 Py_ssize_t width;
6923 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 return NULL;
6925
6926 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006927 if (PyUnicode_CheckExact(self)) {
6928 Py_INCREF(self);
6929 return (PyObject*) self;
6930 }
6931 else
6932 return PyUnicode_FromUnicode(
6933 PyUnicode_AS_UNICODE(self),
6934 PyUnicode_GET_SIZE(self)
6935 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 }
6937
6938 fill = width - self->length;
6939
6940 u = pad(self, fill, 0, '0');
6941
Walter Dörwald068325e2002-04-15 13:36:47 +00006942 if (u == NULL)
6943 return NULL;
6944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 if (u->str[fill] == '+' || u->str[fill] == '-') {
6946 /* move sign to beginning of string */
6947 u->str[0] = u->str[fill];
6948 u->str[fill] = '0';
6949 }
6950
6951 return (PyObject*) u;
6952}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
6961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006962PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006963"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006965Return True if S starts with the specified prefix, False otherwise.\n\
6966With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006967With optional end, stop comparing S at that position.\n\
6968prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969
6970static PyObject *
6971unicode_startswith(PyUnicodeObject *self,
6972 PyObject *args)
6973{
Georg Brandl24250812006-06-09 18:45:48 +00006974 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006976 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006977 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00006978 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979
Georg Brandl24250812006-06-09 18:45:48 +00006980 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00006981 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00006983 if (PyTuple_Check(subobj)) {
6984 Py_ssize_t i;
6985 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6986 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6987 PyTuple_GET_ITEM(subobj, i));
6988 if (substring == NULL)
6989 return NULL;
6990 result = tailmatch(self, substring, start, end, -1);
6991 Py_DECREF(substring);
6992 if (result) {
6993 Py_RETURN_TRUE;
6994 }
6995 }
6996 /* nothing matched */
6997 Py_RETURN_FALSE;
6998 }
6999 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007001 return NULL;
7002 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007004 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005}
7006
7007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007008PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007009"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007011Return True if S ends with the specified suffix, False otherwise.\n\
7012With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007013With optional end, stop comparing S at that position.\n\
7014suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015
7016static PyObject *
7017unicode_endswith(PyUnicodeObject *self,
7018 PyObject *args)
7019{
Georg Brandl24250812006-06-09 18:45:48 +00007020 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007022 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007023 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007024 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025
Georg Brandl24250812006-06-09 18:45:48 +00007026 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7027 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007029 if (PyTuple_Check(subobj)) {
7030 Py_ssize_t i;
7031 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7032 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7033 PyTuple_GET_ITEM(subobj, i));
7034 if (substring == NULL)
7035 return NULL;
7036 result = tailmatch(self, substring, start, end, +1);
7037 Py_DECREF(substring);
7038 if (result) {
7039 Py_RETURN_TRUE;
7040 }
7041 }
7042 Py_RETURN_FALSE;
7043 }
7044 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047
Georg Brandl24250812006-06-09 18:45:48 +00007048 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007050 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051}
7052
7053
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007054
7055static PyObject *
7056unicode_getnewargs(PyUnicodeObject *v)
7057{
7058 return Py_BuildValue("(u#)", v->str, v->length);
7059}
7060
7061
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062static PyMethodDef unicode_methods[] = {
7063
7064 /* Order is according to common usage: often used methods should
7065 appear first, since lookup is done sequentially. */
7066
Georg Brandlecdc0a92006-03-30 12:19:07 +00007067 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007068 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7069 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007070 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007071 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7072 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7073 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7074 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7075 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7076 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7077 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007078 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007079 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7080 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7081 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007082 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007083 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007084/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7085 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7086 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7087 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007088 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007089 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007090 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007091 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007092 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7093 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7094 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7095 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7096 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7097 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7098 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7099 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7100 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7101 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7102 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7103 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7104 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7105 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007106 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007107#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007108 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109#endif
7110
7111#if 0
7112 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007113 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114#endif
7115
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007116 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 {NULL, NULL}
7118};
7119
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007120static PyObject *
7121unicode_mod(PyObject *v, PyObject *w)
7122{
7123 if (!PyUnicode_Check(v)) {
7124 Py_INCREF(Py_NotImplemented);
7125 return Py_NotImplemented;
7126 }
7127 return PyUnicode_Format(v, w);
7128}
7129
7130static PyNumberMethods unicode_as_number = {
7131 0, /*nb_add*/
7132 0, /*nb_subtract*/
7133 0, /*nb_multiply*/
7134 0, /*nb_divide*/
7135 unicode_mod, /*nb_remainder*/
7136};
7137
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007140 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007141 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7142 (ssizeargfunc) unicode_getitem, /* sq_item */
7143 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 0, /* sq_ass_item */
7145 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007146 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147};
7148
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007149static PyObject*
7150unicode_subscript(PyUnicodeObject* self, PyObject* item)
7151{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007152 if (PyIndex_Check(item)) {
7153 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007154 if (i == -1 && PyErr_Occurred())
7155 return NULL;
7156 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007157 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007158 return unicode_getitem(self, i);
7159 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007160 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007161 Py_UNICODE* source_buf;
7162 Py_UNICODE* result_buf;
7163 PyObject* result;
7164
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007165 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007166 &start, &stop, &step, &slicelength) < 0) {
7167 return NULL;
7168 }
7169
7170 if (slicelength <= 0) {
7171 return PyUnicode_FromUnicode(NULL, 0);
7172 } else {
7173 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitzb3635f92008-03-18 04:17:36 +00007174 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7175 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007176
7177 if (result_buf == NULL)
7178 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007179
7180 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7181 result_buf[i] = source_buf[cur];
7182 }
Tim Petersced69f82003-09-16 20:30:58 +00007183
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007184 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitzb3635f92008-03-18 04:17:36 +00007185 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007186 return result;
7187 }
7188 } else {
7189 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7190 return NULL;
7191 }
7192}
7193
7194static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007195 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007196 (binaryfunc)unicode_subscript, /* mp_subscript */
7197 (objobjargproc)0, /* mp_ass_subscript */
7198};
7199
Martin v. Löwis18e16552006-02-15 17:27:45 +00007200static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007202 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 const void **ptr)
7204{
7205 if (index != 0) {
7206 PyErr_SetString(PyExc_SystemError,
7207 "accessing non-existent unicode segment");
7208 return -1;
7209 }
7210 *ptr = (void *) self->str;
7211 return PyUnicode_GET_DATA_SIZE(self);
7212}
7213
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214static Py_ssize_t
7215unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 const void **ptr)
7217{
7218 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007219 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 return -1;
7221}
7222
7223static int
7224unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007225 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226{
7227 if (lenp)
7228 *lenp = PyUnicode_GET_DATA_SIZE(self);
7229 return 1;
7230}
7231
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007232static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007234 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 const void **ptr)
7236{
7237 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007238
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 if (index != 0) {
7240 PyErr_SetString(PyExc_SystemError,
7241 "accessing non-existent unicode segment");
7242 return -1;
7243 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007244 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 if (str == NULL)
7246 return -1;
7247 *ptr = (void *) PyString_AS_STRING(str);
7248 return PyString_GET_SIZE(str);
7249}
7250
7251/* Helpers for PyUnicode_Format() */
7252
7253static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007256 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 if (argidx < arglen) {
7258 (*p_argidx)++;
7259 if (arglen < 0)
7260 return args;
7261 else
7262 return PyTuple_GetItem(args, argidx);
7263 }
7264 PyErr_SetString(PyExc_TypeError,
7265 "not enough arguments for format string");
7266 return NULL;
7267}
7268
/* Bit flags for format-specifier options; combined in an int `flags`
   and tested with `&`. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7274
Martin v. Löwis18e16552006-02-15 17:27:45 +00007275static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007276strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007278 register Py_ssize_t i;
7279 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280 for (i = len - 1; i >= 0; i--)
7281 buffer[i] = (Py_UNICODE) charbuffer[i];
7282
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 return len;
7284}
7285
Neal Norwitzfc76d632006-01-10 06:03:13 +00007286static int
7287doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7288{
Tim Peters15231542006-02-16 01:08:01 +00007289 Py_ssize_t result;
7290
Neal Norwitzfc76d632006-01-10 06:03:13 +00007291 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007292 result = strtounicode(buffer, (char *)buffer);
7293 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007294}
7295
7296static int
7297longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7298{
Tim Peters15231542006-02-16 01:08:01 +00007299 Py_ssize_t result;
7300
Neal Norwitzfc76d632006-01-10 06:03:13 +00007301 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007302 result = strtounicode(buffer, (char *)buffer);
7303 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007304}
7305
Guido van Rossum078151d2002-08-11 04:24:12 +00007306/* XXX To save some code duplication, formatfloat/long/int could have been
7307 shared with stringobject.c, converting from 8-bit to Unicode after the
7308 formatting is done. */
7309
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310static int
7311formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007312 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 int flags,
7314 int prec,
7315 int type,
7316 PyObject *v)
7317{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007318 /* fmt = '%#.' + `prec` + `type`
7319 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320 char fmt[20];
7321 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007322
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323 x = PyFloat_AsDouble(v);
7324 if (x == -1.0 && PyErr_Occurred())
7325 return -1;
7326 if (prec < 0)
7327 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7329 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007330 /* Worst case length calc to ensure no buffer overrun:
7331
7332 'g' formats:
7333 fmt = %#.<prec>g
7334 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7335 for any double rep.)
7336 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7337
7338 'f' formats:
7339 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7340 len = 1 + 50 + 1 + prec = 52 + prec
7341
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007342 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007343 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007344
7345 */
Georg Brandlc5db9232007-07-12 08:38:04 +00007346 if (((type == 'g' || type == 'G') &&
7347 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007348 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007349 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007350 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007351 return -1;
7352 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007353 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7354 (flags&F_ALT) ? "#" : "",
7355 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007356 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357}
7358
Tim Peters38fd5b62000-09-21 05:43:11 +00007359static PyObject*
7360formatlong(PyObject *val, int flags, int prec, int type)
7361{
7362 char *buf;
7363 int i, len;
7364 PyObject *str; /* temporary string object. */
7365 PyUnicodeObject *result;
7366
7367 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7368 if (!str)
7369 return NULL;
7370 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007371 if (!result) {
7372 Py_DECREF(str);
7373 return NULL;
7374 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007375 for (i = 0; i < len; i++)
7376 result->str[i] = buf[i];
7377 result->str[len] = 0;
7378 Py_DECREF(str);
7379 return (PyObject*)result;
7380}
7381
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382static int
7383formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007384 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 int flags,
7386 int prec,
7387 int type,
7388 PyObject *v)
7389{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007390 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007391 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7392 * + 1 + 1
7393 * = 24
7394 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007395 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007396 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 long x;
7398
7399 x = PyInt_AsLong(v);
7400 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007401 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007402 if (x < 0 && type == 'u') {
7403 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007404 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007405 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7406 sign = "-";
7407 else
7408 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007410 prec = 1;
7411
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007412 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7413 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007414 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007415 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007416 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007417 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007418 return -1;
7419 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007420
7421 if ((flags & F_ALT) &&
7422 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007423 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007424 * of issues that cause pain:
7425 * - when 0 is being converted, the C standard leaves off
7426 * the '0x' or '0X', which is inconsistent with other
7427 * %#x/%#X conversions and inconsistent with Python's
7428 * hex() function
7429 * - there are platforms that violate the standard and
7430 * convert 0 with the '0x' or '0X'
7431 * (Metrowerks, Compaq Tru64)
7432 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007433 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007434 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007435 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007436 * We can achieve the desired consistency by inserting our
7437 * own '0x' or '0X' prefix, and substituting %x/%X in place
7438 * of %#x/%#X.
7439 *
7440 * Note that this is the same approach as used in
7441 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007442 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007443 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7444 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007445 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007446 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007447 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7448 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007449 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007450 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007451 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007452 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007453 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007454 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455}
7456
7457static int
7458formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007459 size_t buflen,
7460 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007462 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007463 if (PyUnicode_Check(v)) {
7464 if (PyUnicode_GET_SIZE(v) != 1)
7465 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007469 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007470 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007471 goto onError;
7472 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474
7475 else {
7476 /* Integer input truncated to a character */
7477 long x;
7478 x = PyInt_AsLong(v);
7479 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007480 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007481#ifdef Py_UNICODE_WIDE
7482 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007483 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007484 "%c arg not in range(0x110000) "
7485 "(wide Python build)");
7486 return -1;
7487 }
7488#else
7489 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007490 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007491 "%c arg not in range(0x10000) "
7492 "(narrow Python build)");
7493 return -1;
7494 }
7495#endif
7496 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 }
7498 buf[1] = '\0';
7499 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007500
7501 onError:
7502 PyErr_SetString(PyExc_TypeError,
7503 "%c requires int or char");
7504 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505}
7506
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size, in characters, of the scratch buffer into
   which individual floats, ints and chars are formatted.
   XXX The value is a magic number.  Every formatting routine checks its
   worst-case output length against the buffer size so it cannot
   overflow, but allocating a buffer of the right size per conversion
   might be a better solution.  For now, the current one is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
7516
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517PyObject *PyUnicode_Format(PyObject *format,
7518 PyObject *args)
7519{
7520 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007521 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 int args_owned = 0;
7523 PyUnicodeObject *result = NULL;
7524 PyObject *dict = NULL;
7525 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007526
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 if (format == NULL || args == NULL) {
7528 PyErr_BadInternalCall();
7529 return NULL;
7530 }
7531 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007532 if (uformat == NULL)
7533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 fmt = PyUnicode_AS_UNICODE(uformat);
7535 fmtcnt = PyUnicode_GET_SIZE(uformat);
7536
7537 reslen = rescnt = fmtcnt + 100;
7538 result = _PyUnicode_New(reslen);
7539 if (result == NULL)
7540 goto onError;
7541 res = PyUnicode_AS_UNICODE(result);
7542
7543 if (PyTuple_Check(args)) {
7544 arglen = PyTuple_Size(args);
7545 argidx = 0;
7546 }
7547 else {
7548 arglen = -1;
7549 argidx = -2;
7550 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007551 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7552 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 dict = args;
7554
7555 while (--fmtcnt >= 0) {
7556 if (*fmt != '%') {
7557 if (--rescnt < 0) {
7558 rescnt = fmtcnt + 100;
7559 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007560 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007561 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7563 --rescnt;
7564 }
7565 *res++ = *fmt++;
7566 }
7567 else {
7568 /* Got a format specifier */
7569 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007570 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 Py_UNICODE c = '\0';
7573 Py_UNICODE fill;
7574 PyObject *v = NULL;
7575 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007576 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007578 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007579 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
7581 fmt++;
7582 if (*fmt == '(') {
7583 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007584 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 PyObject *key;
7586 int pcount = 1;
7587
7588 if (dict == NULL) {
7589 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007590 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 goto onError;
7592 }
7593 ++fmt;
7594 --fmtcnt;
7595 keystart = fmt;
7596 /* Skip over balanced parentheses */
7597 while (pcount > 0 && --fmtcnt >= 0) {
7598 if (*fmt == ')')
7599 --pcount;
7600 else if (*fmt == '(')
7601 ++pcount;
7602 fmt++;
7603 }
7604 keylen = fmt - keystart - 1;
7605 if (fmtcnt < 0 || pcount > 0) {
7606 PyErr_SetString(PyExc_ValueError,
7607 "incomplete format key");
7608 goto onError;
7609 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007610#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007611 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 then looked up since Python uses strings to hold
7613 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007614 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 key = PyUnicode_EncodeUTF8(keystart,
7616 keylen,
7617 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007618#else
7619 key = PyUnicode_FromUnicode(keystart, keylen);
7620#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621 if (key == NULL)
7622 goto onError;
7623 if (args_owned) {
7624 Py_DECREF(args);
7625 args_owned = 0;
7626 }
7627 args = PyObject_GetItem(dict, key);
7628 Py_DECREF(key);
7629 if (args == NULL) {
7630 goto onError;
7631 }
7632 args_owned = 1;
7633 arglen = -1;
7634 argidx = -2;
7635 }
7636 while (--fmtcnt >= 0) {
7637 switch (c = *fmt++) {
7638 case '-': flags |= F_LJUST; continue;
7639 case '+': flags |= F_SIGN; continue;
7640 case ' ': flags |= F_BLANK; continue;
7641 case '#': flags |= F_ALT; continue;
7642 case '0': flags |= F_ZERO; continue;
7643 }
7644 break;
7645 }
7646 if (c == '*') {
7647 v = getnextarg(args, arglen, &argidx);
7648 if (v == NULL)
7649 goto onError;
7650 if (!PyInt_Check(v)) {
7651 PyErr_SetString(PyExc_TypeError,
7652 "* wants int");
7653 goto onError;
7654 }
7655 width = PyInt_AsLong(v);
7656 if (width < 0) {
7657 flags |= F_LJUST;
7658 width = -width;
7659 }
7660 if (--fmtcnt >= 0)
7661 c = *fmt++;
7662 }
7663 else if (c >= '0' && c <= '9') {
7664 width = c - '0';
7665 while (--fmtcnt >= 0) {
7666 c = *fmt++;
7667 if (c < '0' || c > '9')
7668 break;
7669 if ((width*10) / 10 != width) {
7670 PyErr_SetString(PyExc_ValueError,
7671 "width too big");
7672 goto onError;
7673 }
7674 width = width*10 + (c - '0');
7675 }
7676 }
7677 if (c == '.') {
7678 prec = 0;
7679 if (--fmtcnt >= 0)
7680 c = *fmt++;
7681 if (c == '*') {
7682 v = getnextarg(args, arglen, &argidx);
7683 if (v == NULL)
7684 goto onError;
7685 if (!PyInt_Check(v)) {
7686 PyErr_SetString(PyExc_TypeError,
7687 "* wants int");
7688 goto onError;
7689 }
7690 prec = PyInt_AsLong(v);
7691 if (prec < 0)
7692 prec = 0;
7693 if (--fmtcnt >= 0)
7694 c = *fmt++;
7695 }
7696 else if (c >= '0' && c <= '9') {
7697 prec = c - '0';
7698 while (--fmtcnt >= 0) {
7699 c = Py_CHARMASK(*fmt++);
7700 if (c < '0' || c > '9')
7701 break;
7702 if ((prec*10) / 10 != prec) {
7703 PyErr_SetString(PyExc_ValueError,
7704 "prec too big");
7705 goto onError;
7706 }
7707 prec = prec*10 + (c - '0');
7708 }
7709 }
7710 } /* prec */
7711 if (fmtcnt >= 0) {
7712 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 if (--fmtcnt >= 0)
7714 c = *fmt++;
7715 }
7716 }
7717 if (fmtcnt < 0) {
7718 PyErr_SetString(PyExc_ValueError,
7719 "incomplete format");
7720 goto onError;
7721 }
7722 if (c != '%') {
7723 v = getnextarg(args, arglen, &argidx);
7724 if (v == NULL)
7725 goto onError;
7726 }
7727 sign = 0;
7728 fill = ' ';
7729 switch (c) {
7730
7731 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007732 pbuf = formatbuf;
7733 /* presume that buffer length is at least 1 */
7734 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 len = 1;
7736 break;
7737
7738 case 's':
7739 case 'r':
7740 if (PyUnicode_Check(v) && c == 's') {
7741 temp = v;
7742 Py_INCREF(temp);
7743 }
7744 else {
7745 PyObject *unicode;
7746 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007747 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 else
7749 temp = PyObject_Repr(v);
7750 if (temp == NULL)
7751 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007752 if (PyUnicode_Check(temp))
7753 /* nothing to do */;
7754 else if (PyString_Check(temp)) {
7755 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007756 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007758 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007760 Py_DECREF(temp);
7761 temp = unicode;
7762 if (temp == NULL)
7763 goto onError;
7764 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007765 else {
7766 Py_DECREF(temp);
7767 PyErr_SetString(PyExc_TypeError,
7768 "%s argument has non-string str()");
7769 goto onError;
7770 }
7771 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007772 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 len = PyUnicode_GET_SIZE(temp);
7774 if (prec >= 0 && len > prec)
7775 len = prec;
7776 break;
7777
7778 case 'i':
7779 case 'd':
7780 case 'u':
7781 case 'o':
7782 case 'x':
7783 case 'X':
7784 if (c == 'i')
7785 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007786 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007787 temp = formatlong(v, flags, prec, c);
7788 if (!temp)
7789 goto onError;
7790 pbuf = PyUnicode_AS_UNICODE(temp);
7791 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007792 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007794 else {
7795 pbuf = formatbuf;
7796 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7797 flags, prec, c, v);
7798 if (len < 0)
7799 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007800 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007801 }
7802 if (flags & F_ZERO)
7803 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 break;
7805
7806 case 'e':
7807 case 'E':
7808 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007809 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 case 'g':
7811 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007812 if (c == 'F')
7813 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007814 pbuf = formatbuf;
7815 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7816 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 if (len < 0)
7818 goto onError;
7819 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007820 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 fill = '0';
7822 break;
7823
7824 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007825 pbuf = formatbuf;
7826 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 if (len < 0)
7828 goto onError;
7829 break;
7830
7831 default:
7832 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007833 "unsupported format character '%c' (0x%x) "
Armin Rigo4b63c212006-10-04 11:44:06 +00007834 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007835 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007836 (int)c,
Armin Rigo4b63c212006-10-04 11:44:06 +00007837 (Py_ssize_t)(fmt - 1 -
7838 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 goto onError;
7840 }
7841 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007842 if (*pbuf == '-' || *pbuf == '+') {
7843 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 len--;
7845 }
7846 else if (flags & F_SIGN)
7847 sign = '+';
7848 else if (flags & F_BLANK)
7849 sign = ' ';
7850 else
7851 sign = 0;
7852 }
7853 if (width < len)
7854 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007855 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 reslen -= rescnt;
7857 rescnt = width + fmtcnt + 100;
7858 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007859 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007860 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007861 PyErr_NoMemory();
7862 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007863 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007864 if (_PyUnicode_Resize(&result, reslen) < 0) {
7865 Py_XDECREF(temp);
7866 goto onError;
7867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 res = PyUnicode_AS_UNICODE(result)
7869 + reslen - rescnt;
7870 }
7871 if (sign) {
7872 if (fill != ' ')
7873 *res++ = sign;
7874 rescnt--;
7875 if (width > len)
7876 width--;
7877 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007878 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7879 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007880 assert(pbuf[1] == c);
7881 if (fill != ' ') {
7882 *res++ = *pbuf++;
7883 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007884 }
Tim Petersfff53252001-04-12 18:38:48 +00007885 rescnt -= 2;
7886 width -= 2;
7887 if (width < 0)
7888 width = 0;
7889 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 if (width > len && !(flags & F_LJUST)) {
7892 do {
7893 --rescnt;
7894 *res++ = fill;
7895 } while (--width > len);
7896 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007897 if (fill == ' ') {
7898 if (sign)
7899 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007900 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007901 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007902 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007903 *res++ = *pbuf++;
7904 *res++ = *pbuf++;
7905 }
7906 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007907 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908 res += len;
7909 rescnt -= len;
7910 while (--width >= len) {
7911 --rescnt;
7912 *res++ = ' ';
7913 }
7914 if (dict && (argidx < arglen) && c != '%') {
7915 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007916 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007917 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 goto onError;
7919 }
7920 Py_XDECREF(temp);
7921 } /* '%' */
7922 } /* until end */
7923 if (argidx < arglen && !dict) {
7924 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007925 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 goto onError;
7927 }
7928
Thomas Woutersa96affe2006-03-12 00:29:36 +00007929 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7930 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931 if (args_owned) {
7932 Py_DECREF(args);
7933 }
7934 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935 return (PyObject *)result;
7936
7937 onError:
7938 Py_XDECREF(result);
7939 Py_DECREF(uformat);
7940 if (args_owned) {
7941 Py_DECREF(args);
7942 }
7943 return NULL;
7944}
7945
7946static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007947 (readbufferproc) unicode_buffer_getreadbuf,
7948 (writebufferproc) unicode_buffer_getwritebuf,
7949 (segcountproc) unicode_buffer_getsegcount,
7950 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951};
7952
Jeremy Hylton938ace62002-07-17 16:30:39 +00007953static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007954unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7955
Tim Peters6d6c1a32001-08-02 04:15:00 +00007956static PyObject *
7957unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7958{
7959 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007960 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007961 char *encoding = NULL;
7962 char *errors = NULL;
7963
Guido van Rossume023fe02001-08-30 03:12:59 +00007964 if (type != &PyUnicode_Type)
7965 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007966 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7967 kwlist, &x, &encoding, &errors))
7968 return NULL;
7969 if (x == NULL)
7970 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007971 if (encoding == NULL && errors == NULL)
7972 return PyObject_Unicode(x);
7973 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007974 return PyUnicode_FromEncodedObject(x, encoding, errors);
7975}
7976
Guido van Rossume023fe02001-08-30 03:12:59 +00007977static PyObject *
7978unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7979{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007980 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007981 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007982
7983 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7984 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7985 if (tmp == NULL)
7986 return NULL;
7987 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007988 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007989 if (pnew == NULL) {
7990 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007991 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007992 }
Neal Norwitzb3635f92008-03-18 04:17:36 +00007993 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007994 if (pnew->str == NULL) {
7995 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007996 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007997 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007998 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007999 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008000 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8001 pnew->length = n;
8002 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008003 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008004 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008005}
8006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008008"unicode(string [, encoding[, errors]]) -> object\n\
8009\n\
8010Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008011encoding defaults to the current default string encoding.\n\
8012errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008013
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014PyTypeObject PyUnicode_Type = {
8015 PyObject_HEAD_INIT(&PyType_Type)
8016 0, /* ob_size */
8017 "unicode", /* tp_name */
8018 sizeof(PyUnicodeObject), /* tp_size */
8019 0, /* tp_itemsize */
8020 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008021 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008023 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008025 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008026 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008027 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008029 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 (hashfunc) unicode_hash, /* tp_hash*/
8031 0, /* tp_call*/
8032 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008033 PyObject_GenericGetAttr, /* tp_getattro */
8034 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008036 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8037 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008038 unicode_doc, /* tp_doc */
8039 0, /* tp_traverse */
8040 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008041 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008042 0, /* tp_weaklistoffset */
8043 0, /* tp_iter */
8044 0, /* tp_iternext */
8045 unicode_methods, /* tp_methods */
8046 0, /* tp_members */
8047 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008048 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008049 0, /* tp_dict */
8050 0, /* tp_descr_get */
8051 0, /* tp_descr_set */
8052 0, /* tp_dictoffset */
8053 0, /* tp_init */
8054 0, /* tp_alloc */
8055 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008056 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057};
8058
8059/* Initialize the Unicode implementation */
8060
Thomas Wouters78890102000-07-22 19:25:51 +00008061void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008063 int i;
8064
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008065 /* XXX - move this array to unicodectype.c ? */
8066 Py_UNICODE linebreak[] = {
8067 0x000A, /* LINE FEED */
8068 0x000D, /* CARRIAGE RETURN */
8069 0x001C, /* FILE SEPARATOR */
8070 0x001D, /* GROUP SEPARATOR */
8071 0x001E, /* RECORD SEPARATOR */
8072 0x0085, /* NEXT LINE */
8073 0x2028, /* LINE SEPARATOR */
8074 0x2029, /* PARAGRAPH SEPARATOR */
8075 };
8076
Fred Drakee4315f52000-05-09 19:53:39 +00008077 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008078 unicode_freelist = NULL;
8079 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008081 if (!unicode_empty)
8082 return;
8083
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008084 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008085 for (i = 0; i < 256; i++)
8086 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008087 if (PyType_Ready(&PyUnicode_Type) < 0)
8088 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008089
8090 /* initialize the linebreak bloom filter */
8091 bloom_linebreak = make_bloom_mask(
8092 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8093 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008094
8095 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096}
8097
8098/* Finalize the Unicode implementation */
8099
8100void
Thomas Wouters78890102000-07-22 19:25:51 +00008101_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008103 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008104 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008106 Py_XDECREF(unicode_empty);
8107 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008109 for (i = 0; i < 256; i++) {
8110 if (unicode_latin1[i]) {
8111 Py_DECREF(unicode_latin1[i]);
8112 unicode_latin1[i] = NULL;
8113 }
8114 }
8115
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008116 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 PyUnicodeObject *v = u;
8118 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008119 if (v->str)
Neal Norwitzb3635f92008-03-18 04:17:36 +00008120 PyObject_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008121 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008122 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008124 unicode_freelist = NULL;
8125 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008127
Anthony Baxterac6bd462006-04-13 02:06:09 +00008128#ifdef __cplusplus
8129}
8130#endif
8131
8132
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008133/*
8134Local variables:
8135c-basic-offset: 4
8136indent-tabs-mode: nil
8137End:
8138*/