blob: 4c308ccfe0a9fbe6ce8817ffea9fec2b7674a6ab [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for buffers of objects sitting on the free list.

   An object placed on the free list keeps its character buffer as long
   as its length is below this limit, which saves a malloc() when a small
   Unicode object is created again.

   The price is at most MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage.

   A limit of 0 effectively disables the feature.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low 5
   bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init(). */
134
/* Integer type holding a bloom bitmask; bits 0..31 are used. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shift is performed on an unsigned long: shifting the int constant 1
   by 31 would overflow a signed type.  `mask` is parenthesized so that
   compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test that consults the cheap bloom mask first and only then
   the exact Py_UNICODE_ISLINEBREAK() predicate. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Membership test that checks the bloom mask before doing the exact
   linear search.  The whole expansion is parenthesized so the macro can
   be negated or combined with other operators without changing meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
Neal Norwitzb3635f92008-03-18 04:17:36 +0000203 unicode->str = PyObject_REALLOC(unicode->str,
204 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000206 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 PyErr_NoMemory();
208 return -1;
209 }
210 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000211 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000215 if (unicode->defenc) {
216 Py_DECREF(unicode->defenc);
217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 }
219 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000220
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 return 0;
222}
223
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
231
232static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000233PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234{
235 register PyUnicodeObject *unicode;
236
Andrew Dalkee0df7622006-05-27 11:04:36 +0000237 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 if (length == 0 && unicode_empty != NULL) {
239 Py_INCREF(unicode_empty);
240 return unicode_empty;
241 }
242
Neal Norwitz4f3be8a2008-07-31 17:08:14 +0000243 /* Ensure we won't overflow the size. */
244 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
245 return (PyUnicodeObject *)PyErr_NoMemory();
246 }
247
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000258 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000263 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
264 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 }
266 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000269 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000270 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 if (unicode == NULL)
272 return NULL;
Neal Norwitzb3635f92008-03-18 04:17:36 +0000273 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
274 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000294
295 onError:
296 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000297 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299}
300
301static
Guido van Rossum9475a232001-10-05 20:51:39 +0000302void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000304 if (PyUnicode_CheckExact(unicode) &&
305 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 /* Keep-Alive optimization */
307 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000308 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 unicode->str = NULL;
310 unicode->length = 0;
311 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000312 if (unicode->defenc) {
313 Py_DECREF(unicode->defenc);
314 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000315 }
316 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317 *(PyUnicodeObject **)unicode = unicode_freelist;
318 unicode_freelist = unicode;
319 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 }
321 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000322 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000323 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 }
326}
327
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329{
330 register PyUnicodeObject *v;
331
332 /* Argument checks */
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
336 }
337 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000338 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000339 PyErr_BadInternalCall();
340 return -1;
341 }
342
343 /* Resizing unicode_empty and single character objects is not
344 possible since these are being shared. We simply return a fresh
345 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000346 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000347 (v == unicode_empty || v->length == 1)) {
348 PyUnicodeObject *w = _PyUnicode_New(length);
349 if (w == NULL)
350 return -1;
351 Py_UNICODE_COPY(w->str, v->str,
352 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000353 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000354 *unicode = (PyObject *)w;
355 return 0;
356 }
357
358 /* Note that we don't have to modify *unicode for unshared Unicode
359 objects, since we can modify them in-place. */
360 return unicode_resize(v, length);
361}
362
/* Internal API for use in unicodeobject.c only !
   Calls PyUnicode_Resize() after casting `unicodevar` to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
366
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000368 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369{
370 PyUnicodeObject *unicode;
371
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 /* If the Unicode data is known at construction time, we can apply
373 some optimizations which share commonly used objects. */
374 if (u != NULL) {
375
376 /* Optimization for empty strings */
377 if (size == 0 && unicode_empty != NULL) {
378 Py_INCREF(unicode_empty);
379 return (PyObject *)unicode_empty;
380 }
381
382 /* Single character Unicode objects in the Latin-1 range are
383 shared when using this constructor */
384 if (size == 1 && *u < 256) {
385 unicode = unicode_latin1[*u];
386 if (!unicode) {
387 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000388 if (!unicode)
389 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000390 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 unicode_latin1[*u] = unicode;
392 }
393 Py_INCREF(unicode);
394 return (PyObject *)unicode;
395 }
396 }
Tim Petersced69f82003-09-16 20:30:58 +0000397
Guido van Rossumd57fd912000-03-10 22:53:23 +0000398 unicode = _PyUnicode_New(size);
399 if (!unicode)
400 return NULL;
401
402 /* Copy the Unicode data into the new object */
403 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 return (PyObject *)unicode;
407}
408
409#ifdef HAVE_WCHAR_H
410
411PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000412 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413{
414 PyUnicodeObject *unicode;
415
416 if (w == NULL) {
417 PyErr_BadInternalCall();
418 return NULL;
419 }
420
421 unicode = _PyUnicode_New(size);
422 if (!unicode)
423 return NULL;
424
425 /* Copy the wchar_t data into the new object */
426#ifdef HAVE_USABLE_WCHAR_T
427 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000428#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429 {
430 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000431 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000433 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000434 *u++ = *w++;
435 }
436#endif
437
438 return (PyObject *)unicode;
439}
440
Martin v. Löwis18e16552006-02-15 17:27:45 +0000441Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
442 wchar_t *w,
443 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444{
445 if (unicode == NULL) {
446 PyErr_BadInternalCall();
447 return -1;
448 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000449
450 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000452 size = PyUnicode_GET_SIZE(unicode) + 1;
453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454#ifdef HAVE_USABLE_WCHAR_T
455 memcpy(w, unicode->str, size * sizeof(wchar_t));
456#else
457 {
458 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000459 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000461 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462 *w++ = *u++;
463 }
464#endif
465
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000466 if (size > PyUnicode_GET_SIZE(unicode))
467 return PyUnicode_GET_SIZE(unicode);
468 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 return size;
470}
471
472#endif
473
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000474PyObject *PyUnicode_FromOrdinal(int ordinal)
475{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000476 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000477
478#ifdef Py_UNICODE_WIDE
479 if (ordinal < 0 || ordinal > 0x10ffff) {
480 PyErr_SetString(PyExc_ValueError,
481 "unichr() arg not in range(0x110000) "
482 "(wide Python build)");
483 return NULL;
484 }
485#else
486 if (ordinal < 0 || ordinal > 0xffff) {
487 PyErr_SetString(PyExc_ValueError,
488 "unichr() arg not in range(0x10000) "
489 "(narrow Python build)");
490 return NULL;
491 }
492#endif
493
Hye-Shik Chang40574832004-04-06 07:24:51 +0000494 s[0] = (Py_UNICODE)ordinal;
495 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000496}
497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498PyObject *PyUnicode_FromObject(register PyObject *obj)
499{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000500 /* XXX Perhaps we should make this API an alias of
501 PyObject_Unicode() instead ?! */
502 if (PyUnicode_CheckExact(obj)) {
503 Py_INCREF(obj);
504 return obj;
505 }
506 if (PyUnicode_Check(obj)) {
507 /* For a Unicode subtype that's not a Unicode object,
508 return a true Unicode object with the same data. */
509 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
510 PyUnicode_GET_SIZE(obj));
511 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
513}
514
515PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
516 const char *encoding,
517 const char *errors)
518{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000519 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000520 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000521 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000522
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523 if (obj == NULL) {
524 PyErr_BadInternalCall();
525 return NULL;
526 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000527
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000528#if 0
529 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000530 that no encodings is given and then redirect to
531 PyObject_Unicode() which then applies the additional logic for
532 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000533
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000534 NOTE: This API should really only be used for object which
535 represent *encoded* Unicode !
536
537 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000538 if (PyUnicode_Check(obj)) {
539 if (encoding) {
540 PyErr_SetString(PyExc_TypeError,
541 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000542 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000543 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000544 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000545 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000546#else
547 if (PyUnicode_Check(obj)) {
548 PyErr_SetString(PyExc_TypeError,
549 "decoding Unicode is not supported");
550 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000551 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000552#endif
553
554 /* Coerce object */
555 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000556 s = PyString_AS_STRING(obj);
557 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000559 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
560 /* Overwrite the error message with something more useful in
561 case of a TypeError. */
562 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000563 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000564 "coercing to Unicode: need string or buffer, "
565 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000566 obj->ob_type->tp_name);
567 goto onError;
568 }
Tim Petersced69f82003-09-16 20:30:58 +0000569
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000570 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 if (len == 0) {
572 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000573 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000574 }
Tim Petersced69f82003-09-16 20:30:58 +0000575 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000576 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000577
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000578 return v;
579
580 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582}
583
584PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000585 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 const char *encoding,
587 const char *errors)
588{
589 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000590
591 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000592 encoding = PyUnicode_GetDefaultEncoding();
593
594 /* Shortcuts for common default encodings */
595 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000597 else if (strcmp(encoding, "latin-1") == 0)
598 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000599#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
600 else if (strcmp(encoding, "mbcs") == 0)
601 return PyUnicode_DecodeMBCS(s, size, errors);
602#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000603 else if (strcmp(encoding, "ascii") == 0)
604 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605
606 /* Decode via the codec registry */
607 buffer = PyBuffer_FromMemory((void *)s, size);
608 if (buffer == NULL)
609 goto onError;
610 unicode = PyCodec_Decode(buffer, encoding, errors);
611 if (unicode == NULL)
612 goto onError;
613 if (!PyUnicode_Check(unicode)) {
614 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000615 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000616 unicode->ob_type->tp_name);
617 Py_DECREF(unicode);
618 goto onError;
619 }
620 Py_DECREF(buffer);
621 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000622
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 onError:
624 Py_XDECREF(buffer);
625 return NULL;
626}
627
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000628PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
629 const char *encoding,
630 const char *errors)
631{
632 PyObject *v;
633
634 if (!PyUnicode_Check(unicode)) {
635 PyErr_BadArgument();
636 goto onError;
637 }
638
639 if (encoding == NULL)
640 encoding = PyUnicode_GetDefaultEncoding();
641
642 /* Decode via the codec registry */
643 v = PyCodec_Decode(unicode, encoding, errors);
644 if (v == NULL)
645 goto onError;
646 return v;
647
648 onError:
649 return NULL;
650}
651
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000653 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 const char *encoding,
655 const char *errors)
656{
657 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000658
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 unicode = PyUnicode_FromUnicode(s, size);
660 if (unicode == NULL)
661 return NULL;
662 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
663 Py_DECREF(unicode);
664 return v;
665}
666
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000667PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
668 const char *encoding,
669 const char *errors)
670{
671 PyObject *v;
672
673 if (!PyUnicode_Check(unicode)) {
674 PyErr_BadArgument();
675 goto onError;
676 }
677
678 if (encoding == NULL)
679 encoding = PyUnicode_GetDefaultEncoding();
680
681 /* Encode via the codec registry */
682 v = PyCodec_Encode(unicode, encoding, errors);
683 if (v == NULL)
684 goto onError;
685 return v;
686
687 onError:
688 return NULL;
689}
690
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
692 const char *encoding,
693 const char *errors)
694{
695 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000696
Guido van Rossumd57fd912000-03-10 22:53:23 +0000697 if (!PyUnicode_Check(unicode)) {
698 PyErr_BadArgument();
699 goto onError;
700 }
Fred Drakee4315f52000-05-09 19:53:39 +0000701
Tim Petersced69f82003-09-16 20:30:58 +0000702 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000703 encoding = PyUnicode_GetDefaultEncoding();
704
705 /* Shortcuts for common default encodings */
706 if (errors == NULL) {
707 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000708 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000709 else if (strcmp(encoding, "latin-1") == 0)
710 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000711#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
712 else if (strcmp(encoding, "mbcs") == 0)
713 return PyUnicode_AsMBCSString(unicode);
714#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000715 else if (strcmp(encoding, "ascii") == 0)
716 return PyUnicode_AsASCIIString(unicode);
717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718
719 /* Encode via the codec registry */
720 v = PyCodec_Encode(unicode, encoding, errors);
721 if (v == NULL)
722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 if (!PyString_Check(v)) {
724 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000725 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 v->ob_type->tp_name);
727 Py_DECREF(v);
728 goto onError;
729 }
730 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000731
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 onError:
733 return NULL;
734}
735
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000736PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
737 const char *errors)
738{
739 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
740
741 if (v)
742 return v;
743 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
744 if (v && errors == NULL)
745 ((PyUnicodeObject *)unicode)->defenc = v;
746 return v;
747}
748
Guido van Rossumd57fd912000-03-10 22:53:23 +0000749Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
750{
751 if (!PyUnicode_Check(unicode)) {
752 PyErr_BadArgument();
753 goto onError;
754 }
755 return PyUnicode_AS_UNICODE(unicode);
756
757 onError:
758 return NULL;
759}
760
Martin v. Löwis18e16552006-02-15 17:27:45 +0000761Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762{
763 if (!PyUnicode_Check(unicode)) {
764 PyErr_BadArgument();
765 goto onError;
766 }
767 return PyUnicode_GET_SIZE(unicode);
768
769 onError:
770 return -1;
771}
772
Thomas Wouters78890102000-07-22 19:25:51 +0000773const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000774{
775 return unicode_default_encoding;
776}
777
778int PyUnicode_SetDefaultEncoding(const char *encoding)
779{
780 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000781
Fred Drakee4315f52000-05-09 19:53:39 +0000782 /* Make sure the encoding is valid. As side effect, this also
783 loads the encoding into the codec registry cache. */
784 v = _PyCodec_Lookup(encoding);
785 if (v == NULL)
786 goto onError;
787 Py_DECREF(v);
788 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000789 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000790 sizeof(unicode_default_encoding));
791 return 0;
792
793 onError:
794 return -1;
795}
796
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000797/* error handling callback helper:
798 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000799 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800 and adjust various state variables.
801 return 0 on success, -1 on error
802*/
803
804static
805int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
806 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000807 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
808 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811
812 PyObject *restuple = NULL;
813 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
815 Py_ssize_t requiredsize;
816 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000817 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000818 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000819 int res = -1;
820
821 if (*errorHandler == NULL) {
822 *errorHandler = PyCodec_LookupError(errors);
823 if (*errorHandler == NULL)
824 goto onError;
825 }
826
827 if (*exceptionObject == NULL) {
828 *exceptionObject = PyUnicodeDecodeError_Create(
829 encoding, input, insize, *startinpos, *endinpos, reason);
830 if (*exceptionObject == NULL)
831 goto onError;
832 }
833 else {
834 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
835 goto onError;
836 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
837 goto onError;
838 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
839 goto onError;
840 }
841
842 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
843 if (restuple == NULL)
844 goto onError;
845 if (!PyTuple_Check(restuple)) {
846 PyErr_Format(PyExc_TypeError, &argparse[4]);
847 goto onError;
848 }
849 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
850 goto onError;
851 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000852 newpos = insize+newpos;
853 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000854 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000855 goto onError;
856 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857
858 /* need more space? (at least enough for what we
859 have+the replacement+the rest of the string (starting
860 at the new input position), so we won't have to check space
861 when there are no errors in the rest of the string) */
862 repptr = PyUnicode_AS_UNICODE(repunicode);
863 repsize = PyUnicode_GET_SIZE(repunicode);
864 requiredsize = *outpos + repsize + insize-newpos;
865 if (requiredsize > outsize) {
866 if (requiredsize<2*outsize)
867 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000868 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 goto onError;
870 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
871 }
872 *endinpos = newpos;
873 *inptr = input + newpos;
874 Py_UNICODE_COPY(*outptr, repptr, repsize);
875 *outptr += repsize;
876 *outpos += repsize;
877 /* we made it! */
878 res = 0;
879
880 onError:
881 Py_XDECREF(restuple);
882 return res;
883}
884
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000885/* --- UTF-7 Codec -------------------------------------------------------- */
886
887/* see RFC2152 for details */
888
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character value.  Each row below covers 16 code points.
   Consulted through the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00-0x0F: controls; TAB (0x09), LF (0x0A) and CR (0x0D) are class 2 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: space ! " # $ % & ' ( ) * + , - . /
       NOTE(review): '/' (0x2F) is class 1 here although RFC 2152 appears to
       list it among the directly encodable characters -- confirm intent. */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
907
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: anything outside 1..127, class 1
   characters, and classes 2 / 3 when encodeWS / encodeO are set.
   Evaluates c more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Uses explicit ASCII ranges
   instead of isalnum(): isalnum() depends on the current locale for
   values 0x80..0xFF and is undefined for values that do not fit in an
   unsigned char, and UB64() is only meaningful for ASCII digits.
   Evaluates c more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only valid when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to `out` for as long as at least 6 of the `bits`
   pending bits in `ch` remain; updates `out` and `bits` in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit code units to `out` for as long as at least 16 of the
   `bits` pending bits in `ch` remain.  Expects `errmsg` and the label
   `utf7Error` to exist at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953 const char *errors)
954{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000955 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000956 Py_ssize_t startinpos;
957 Py_ssize_t endinpos;
958 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 const char *e;
960 PyUnicodeObject *unicode;
961 Py_UNICODE *p;
962 const char *errmsg = "";
963 int inShift = 0;
964 unsigned int bitsleft = 0;
965 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966 int surrogate = 0;
967 PyObject *errorHandler = NULL;
968 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969
970 unicode = _PyUnicode_New(size);
971 if (!unicode)
972 return NULL;
973 if (size == 0)
974 return (PyObject *)unicode;
975
976 p = unicode->str;
977 e = s + size;
978
979 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000980 Py_UNICODE ch;
981 restart:
Antoine Pitrouc8e4bed2008-07-25 19:00:48 +0000982 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000983
984 if (inShift) {
985 if ((ch == '-') || !B64CHAR(ch)) {
986 inShift = 0;
987 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000988
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000989 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
990 if (bitsleft >= 6) {
991 /* The shift sequence has a partial character in it. If
992 bitsleft < 6 then we could just classify it as padding
993 but that is not the case here */
994
995 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000999 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001000 here so indicate the potential of a misencoded character. */
1001
1002 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1003 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1004 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001005 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 }
1007
1008 if (ch == '-') {
1009 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001010 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 inShift = 1;
1012 }
1013 } else if (SPECIAL(ch,0,0)) {
1014 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001015 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001016 } else {
1017 *p++ = ch;
1018 }
1019 } else {
1020 charsleft = (charsleft << 6) | UB64(ch);
1021 bitsleft += 6;
1022 s++;
1023 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1024 }
1025 }
1026 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001027 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001028 s++;
1029 if (s < e && *s == '-') {
1030 s++;
1031 *p++ = '+';
1032 } else
1033 {
1034 inShift = 1;
1035 bitsleft = 0;
1036 }
1037 }
1038 else if (SPECIAL(ch,0,0)) {
1039 errmsg = "unexpected special character";
1040 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001041 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001042 }
1043 else {
1044 *p++ = ch;
1045 s++;
1046 }
1047 continue;
1048 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001049 outpos = p-PyUnicode_AS_UNICODE(unicode);
1050 endinpos = s-starts;
1051 if (unicode_decode_call_errorhandler(
1052 errors, &errorHandler,
1053 "utf7", errmsg,
1054 starts, size, &startinpos, &endinpos, &exc, &s,
1055 (PyObject **)&unicode, &outpos, &p))
1056 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001057 }
1058
1059 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001060 outpos = p-PyUnicode_AS_UNICODE(unicode);
1061 endinpos = size;
1062 if (unicode_decode_call_errorhandler(
1063 errors, &errorHandler,
1064 "utf7", "unterminated shift sequence",
1065 starts, size, &startinpos, &endinpos, &exc, &s,
1066 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001067 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001068 if (s < e)
1069 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 }
1071
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001072 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 goto onError;
1074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001075 Py_XDECREF(errorHandler);
1076 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001077 return (PyObject *)unicode;
1078
1079onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001080 Py_XDECREF(errorHandler);
1081 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001082 Py_DECREF(unicode);
1083 return NULL;
1084}
1085
1086
/* Encode `size` Py_UNICODE units starting at `s` as UTF-7.

   s, size          -- input buffer
   encodeSetO       -- non-zero: also base64-encode class 3 characters
                       (see utf7_special / SPECIAL)
   encodeWhiteSpace -- non-zero: also base64-encode class 2 characters
   errors           -- not referenced in this function

   Returns a string object, or NULL if allocation fails.

   The output is allocated at 5 bytes per input unit and cut back to the
   bytes actually written at the end.  `inShift` tracks whether a '+...'
   base64 run is open; `charsleft`/`bitsleft` hold bits not yet emitted. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                               Py_ssize_t size,
                               int encodeSetO,
                               int encodeWhiteSpace,
                               const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    Py_ssize_t cbAllocated = 5 * size;
    int inShift = 0;
    Py_ssize_t i = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    char * out;
    char * start;

    /* Detect overflow of the 5 * size computation above. */
    if (cbAllocated / 5 != size)
        return PyErr_NoMemory();

    if (size == 0)
        return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
            if (ch == '+') {
                /* A literal '+' is written as "+-". */
                *out++ = '+';
                *out++ = '-';
            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Open a shift sequence and emit the first full digits. */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
            } else {
                *out++ = (char) ch;
            }
        } else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Flush the pending bits, padded on the right, then leave
                   the shift sequence. */
                *out++ = B64(charsleft << (6-bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            } else {
                /* Stay in the shift sequence: append 16 more bits. */
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to terminate
                   the shift sequence. If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */

                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i+1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
                            /* next character continues the shift sequence */

                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }

                    }
                    else {
                        /* End of input: close the sequence explicitly. */
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
    }
    /* Flush bits still pending at the end of input and close the sequence. */
    if (bitsleft) {
        *out++= B64(charsleft << (6-bitsleft) );
        *out++ = '-';
    }

    /* The return value of _PyString_Resize() is not checked here; the
       function returns whatever `v` is afterwards.
       NOTE(review): presumably `v` is set to NULL on failure -- confirm
       against the _PyString_Resize() contract. */
    _PyString_Resize(&v, out - start);
    return v;
}
1182
1183#undef SPECIAL
1184#undef B64
1185#undef B64CHAR
1186#undef UB64
1187#undef ENCODE
1188#undef DECODE
1189
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190/* --- UTF-8 Codec -------------------------------------------------------- */
1191
/* Sequence length implied by the first byte of a UTF-8 sequence, indexed
   by that byte's value.  Each row below covers 16 byte values. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four; 0xF8-0xFB: five; 0xFC-0xFD: six; 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1213
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001215 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 const char *errors)
1217{
Walter Dörwald69652032004-09-07 20:24:22 +00001218 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1219}
1220
/* Decode `size` bytes of UTF-8 data starting at `s`.

   s, size  -- input buffer
   errors   -- error handling name, forwarded to
               unicode_decode_call_errorhandler()
   consumed -- if non-NULL, an incomplete sequence at the end of the
               input is not an error: decoding stops in front of it and
               *consumed receives the number of bytes processed.  If
               NULL, such a sequence is reported as "unexpected end of
               data".

   Returns the decoded Unicode object, or NULL if allocation fails or the
   error handler reports an error. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: every sequence decoded below writes no more code units than
       the bytes it consumes, so `size` units are enough for the decoded
       data; replacement text from an error handler is accommodated by
       the helper, which resizes the output itself. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: ASCII bytes map to themselves. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* Sequence would run past the end of the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            /* Byte that cannot start a sequence. */
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Length 1 is only listed for bytes < 0x80, which were
               handled by the fast path above. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject values that would have fit in a single byte. */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            /* Reject values that would have fit in two bytes. */
            if (ch < 0x0800) {
                /* Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences;

                   XXX For wide builds (UCS-4) we should probably try
                   to recombine the surrogates into a single code
                   unit.
                */
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        s += n;
        continue;

    utf8Error:
        /* Shared error exit for the loop body; the helper may resize
           `unicode` and repositions `s` and `p`. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1407
Tim Peters602f7402002-04-27 18:03:26 +00001408/* Allocation strategy: if the string is short, convert into a stack buffer
1409 and allocate exactly as much space needed at the end. Else allocate the
1410 maximum possible needed (4 result bytes per Unicode character), and return
1411 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001412*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001413PyObject *
1414PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001415 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001416 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417{
Tim Peters602f7402002-04-27 18:03:26 +00001418#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001419
Martin v. Löwis18e16552006-02-15 17:27:45 +00001420 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001421 PyObject *v; /* result string object */
1422 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001423 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001424 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001425 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001426
Tim Peters602f7402002-04-27 18:03:26 +00001427 assert(s != NULL);
1428 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
Tim Peters602f7402002-04-27 18:03:26 +00001430 if (size <= MAX_SHORT_UNICHARS) {
1431 /* Write into the stack buffer; nallocated can't overflow.
1432 * At the end, we'll allocate exactly as much heap space as it
1433 * turns out we need.
1434 */
1435 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1436 v = NULL; /* will allocate after we're done */
1437 p = stackbuf;
1438 }
1439 else {
1440 /* Overallocate on the heap, and give the excess back at the end. */
1441 nallocated = size * 4;
1442 if (nallocated / 4 != size) /* overflow! */
1443 return PyErr_NoMemory();
1444 v = PyString_FromStringAndSize(NULL, nallocated);
1445 if (v == NULL)
1446 return NULL;
1447 p = PyString_AS_STRING(v);
1448 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001449
Tim Peters602f7402002-04-27 18:03:26 +00001450 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001451 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001452
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001453 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001454 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001456
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001458 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001459 *p++ = (char)(0xc0 | (ch >> 6));
1460 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001461 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001462 else {
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Encode UCS2 Unicode ordinals */
1464 if (ch < 0x10000) {
1465 /* Special case: check for high surrogate */
1466 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1467 Py_UCS4 ch2 = s[i];
1468 /* Check for low surrogate and combine the two to
1469 form a UCS4 value */
1470 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001471 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001472 i++;
1473 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001474 }
Tim Peters602f7402002-04-27 18:03:26 +00001475 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001476 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001477 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001478 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1479 *p++ = (char)(0x80 | (ch & 0x3f));
1480 continue;
1481 }
1482encodeUCS4:
1483 /* Encode UCS4 Unicode ordinals */
1484 *p++ = (char)(0xf0 | (ch >> 18));
1485 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1486 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1487 *p++ = (char)(0x80 | (ch & 0x3f));
1488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001490
Tim Peters602f7402002-04-27 18:03:26 +00001491 if (v == NULL) {
1492 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001493 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001494 assert(nneeded <= nallocated);
1495 v = PyString_FromStringAndSize(stackbuf, nneeded);
1496 }
1497 else {
1498 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001499 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001500 assert(nneeded <= nallocated);
1501 _PyString_Resize(&v, nneeded);
1502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001504
Tim Peters602f7402002-04-27 18:03:26 +00001505#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1509{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 if (!PyUnicode_Check(unicode)) {
1511 PyErr_BadArgument();
1512 return NULL;
1513 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001514 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode),
1516 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517}
1518
1519/* --- UTF-16 Codec ------------------------------------------------------- */
1520
Tim Peters772747b2001-08-09 22:21:55 +00001521PyObject *
1522PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001524 const char *errors,
1525 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526{
Walter Dörwald69652032004-09-07 20:24:22 +00001527 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1528}
1529
1530PyObject *
1531PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001532 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001533 const char *errors,
1534 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001535 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001537 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001538 Py_ssize_t startinpos;
1539 Py_ssize_t endinpos;
1540 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 PyUnicodeObject *unicode;
1542 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001543 const unsigned char *q, *e;
1544 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001545 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001546 /* Offsets from q for retrieving byte pairs in the right order. */
1547#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1548 int ihi = 1, ilo = 0;
1549#else
1550 int ihi = 0, ilo = 1;
1551#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 PyObject *errorHandler = NULL;
1553 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554
1555 /* Note: size will always be longer than the resulting Unicode
1556 character count */
1557 unicode = _PyUnicode_New(size);
1558 if (!unicode)
1559 return NULL;
1560 if (size == 0)
1561 return (PyObject *)unicode;
1562
1563 /* Unpack UTF-16 encoded data */
1564 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001565 q = (unsigned char *)s;
1566 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567
1568 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001569 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001571 /* Check for BOM marks (U+FEFF) in the input and adjust current
1572 byte order setting accordingly. In native mode, the leading BOM
1573 mark is skipped, in all other modes, it is copied to the output
1574 stream as-is (giving a ZWNBSP character). */
1575 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (size >= 2) {
1577 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001578#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001579 if (bom == 0xFEFF) {
1580 q += 2;
1581 bo = -1;
1582 }
1583 else if (bom == 0xFFFE) {
1584 q += 2;
1585 bo = 1;
1586 }
Tim Petersced69f82003-09-16 20:30:58 +00001587#else
Walter Dörwald69652032004-09-07 20:24:22 +00001588 if (bom == 0xFEFF) {
1589 q += 2;
1590 bo = 1;
1591 }
1592 else if (bom == 0xFFFE) {
1593 q += 2;
1594 bo = -1;
1595 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001596#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001597 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599
Tim Peters772747b2001-08-09 22:21:55 +00001600 if (bo == -1) {
1601 /* force LE */
1602 ihi = 1;
1603 ilo = 0;
1604 }
1605 else if (bo == 1) {
1606 /* force BE */
1607 ihi = 0;
1608 ilo = 1;
1609 }
1610
1611 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001613 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001615 if (consumed)
1616 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001617 errmsg = "truncated data";
1618 startinpos = ((const char *)q)-starts;
1619 endinpos = ((const char *)e)-starts;
1620 goto utf16Error;
1621 /* The remaining input chars are ignored if the callback
1622 chooses to skip the input */
1623 }
1624 ch = (q[ihi] << 8) | q[ilo];
1625
Tim Peters772747b2001-08-09 22:21:55 +00001626 q += 2;
1627
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 if (ch < 0xD800 || ch > 0xDFFF) {
1629 *p++ = ch;
1630 continue;
1631 }
1632
1633 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001634 if (q >= e) {
1635 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001636 startinpos = (((const char *)q)-2)-starts;
1637 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001638 goto utf16Error;
1639 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001640 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001641 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1642 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001643 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001644#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001645 *p++ = ch;
1646 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001647#else
1648 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001650 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001651 }
1652 else {
1653 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 startinpos = (((const char *)q)-4)-starts;
1655 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001656 goto utf16Error;
1657 }
1658
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001661 startinpos = (((const char *)q)-2)-starts;
1662 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001663 /* Fall through to report the error */
1664
1665 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 outpos = p-PyUnicode_AS_UNICODE(unicode);
1667 if (unicode_decode_call_errorhandler(
1668 errors, &errorHandler,
1669 "utf16", errmsg,
1670 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1671 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 }
1674
1675 if (byteorder)
1676 *byteorder = bo;
1677
Walter Dörwald69652032004-09-07 20:24:22 +00001678 if (consumed)
1679 *consumed = (const char *)q-starts;
1680
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001682 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 goto onError;
1684
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001685 Py_XDECREF(errorHandler);
1686 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 return (PyObject *)unicode;
1688
1689onError:
1690 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001691 Py_XDECREF(errorHandler);
1692 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 return NULL;
1694}
1695
Tim Peters772747b2001-08-09 22:21:55 +00001696PyObject *
1697PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001698 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001699 const char *errors,
1700 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701{
1702 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001703 unsigned char *p;
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00001704 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001705#ifdef Py_UNICODE_WIDE
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00001706 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001707#else
1708 const int pairs = 0;
1709#endif
Tim Peters772747b2001-08-09 22:21:55 +00001710 /* Offsets from p for storing byte pairs in the right order. */
1711#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1712 int ihi = 1, ilo = 0;
1713#else
1714 int ihi = 0, ilo = 1;
1715#endif
1716
1717#define STORECHAR(CH) \
1718 do { \
1719 p[ihi] = ((CH) >> 8) & 0xff; \
1720 p[ilo] = (CH) & 0xff; \
1721 p += 2; \
1722 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001724#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001725 for (i = pairs = 0; i < size; i++)
1726 if (s[i] >= 0x10000)
1727 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001728#endif
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00001729 /* 2 * (size + pairs + (byteorder == 0)) */
1730 if (size > PY_SSIZE_T_MAX ||
1731 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
1732 return PyErr_NoMemory();
1733 nsize = (size + pairs + (byteorder == 0));
1734 bytesize = nsize * 2;
1735 if (bytesize / 2 != nsize)
1736 return PyErr_NoMemory();
1737 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 if (v == NULL)
1739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740
Tim Peters772747b2001-08-09 22:21:55 +00001741 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001743 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001744 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001745 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001746
1747 if (byteorder == -1) {
1748 /* force LE */
1749 ihi = 1;
1750 ilo = 0;
1751 }
1752 else if (byteorder == 1) {
1753 /* force BE */
1754 ihi = 0;
1755 ilo = 1;
1756 }
1757
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001758 while (size-- > 0) {
1759 Py_UNICODE ch = *s++;
1760 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001761#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001762 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001763 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1764 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001766#endif
Tim Peters772747b2001-08-09 22:21:55 +00001767 STORECHAR(ch);
1768 if (ch2)
1769 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001772#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773}
1774
1775PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1776{
1777 if (!PyUnicode_Check(unicode)) {
1778 PyErr_BadArgument();
1779 return NULL;
1780 }
1781 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1782 PyUnicode_GET_SIZE(unicode),
1783 NULL,
1784 0);
1785}
1786
1787/* --- Unicode Escape Codec ----------------------------------------------- */
1788
/* Name-lookup C API exported by the unicodedata module; stays NULL until
   PyUnicode_DecodeUnicodeEscape() loads it for the first \N escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001789static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001790
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001792 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 const char *errors)
1794{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001796 Py_ssize_t startinpos;
1797 Py_ssize_t endinpos;
1798 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803 char* message;
1804 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 PyObject *errorHandler = NULL;
1806 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001807
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 /* Escaped strings will always be longer than the resulting
1809 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 length after conversion to the true value.
1811 (but if the error callback returns a long replacement string
1812 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813 v = _PyUnicode_New(size);
1814 if (v == NULL)
1815 goto onError;
1816 if (size == 0)
1817 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001818
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001820 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001821
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822 while (s < end) {
1823 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001824 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826
1827 /* Non-escape characters are interpreted as Unicode ordinals */
1828 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001829 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 continue;
1831 }
1832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 /* \ - Escapes */
1835 s++;
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001836 c = *s++;
1837 if (s > end)
1838 c = '\0'; /* Invalid after \ */
1839 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840
1841 /* \x escapes */
1842 case '\n': break;
1843 case '\\': *p++ = '\\'; break;
1844 case '\'': *p++ = '\''; break;
1845 case '\"': *p++ = '\"'; break;
1846 case 'b': *p++ = '\b'; break;
1847 case 'f': *p++ = '\014'; break; /* FF */
1848 case 't': *p++ = '\t'; break;
1849 case 'n': *p++ = '\n'; break;
1850 case 'r': *p++ = '\r'; break;
1851 case 'v': *p++ = '\013'; break; /* VT */
1852 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1853
1854 /* \OOO (octal) escapes */
1855 case '0': case '1': case '2': case '3':
1856 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001857 x = s[-1] - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001858 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001859 x = (x<<3) + *s++ - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001860 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001861 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001863 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 break;
1865
Fredrik Lundhccc74732001-02-18 22:13:49 +00001866 /* hex escapes */
1867 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001869 digits = 2;
1870 message = "truncated \\xXX escape";
1871 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872
Fredrik Lundhccc74732001-02-18 22:13:49 +00001873 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001875 digits = 4;
1876 message = "truncated \\uXXXX escape";
1877 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001880 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001881 digits = 8;
1882 message = "truncated \\UXXXXXXXX escape";
1883 hexescape:
1884 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885 outpos = p-PyUnicode_AS_UNICODE(v);
1886 if (s+digits>end) {
1887 endinpos = size;
1888 if (unicode_decode_call_errorhandler(
1889 errors, &errorHandler,
1890 "unicodeescape", "end of string in escape sequence",
1891 starts, size, &startinpos, &endinpos, &exc, &s,
1892 (PyObject **)&v, &outpos, &p))
1893 goto onError;
1894 goto nextByte;
1895 }
1896 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001897 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001898 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001899 endinpos = (s+i+1)-starts;
1900 if (unicode_decode_call_errorhandler(
1901 errors, &errorHandler,
1902 "unicodeescape", message,
1903 starts, size, &startinpos, &endinpos, &exc, &s,
1904 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001905 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001906 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001907 }
1908 chr = (chr<<4) & ~0xF;
1909 if (c >= '0' && c <= '9')
1910 chr += c - '0';
1911 else if (c >= 'a' && c <= 'f')
1912 chr += 10 + c - 'a';
1913 else
1914 chr += 10 + c - 'A';
1915 }
1916 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001917 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001918 /* _decoding_error will have already written into the
1919 target buffer. */
1920 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001921 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001922 /* when we get here, chr is a 32-bit unicode character */
1923 if (chr <= 0xffff)
1924 /* UCS-2 character */
1925 *p++ = (Py_UNICODE) chr;
1926 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001927 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001928 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001929#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001930 *p++ = chr;
1931#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001932 chr -= 0x10000L;
1933 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001934 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001935#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001936 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001937 endinpos = s-starts;
1938 outpos = p-PyUnicode_AS_UNICODE(v);
1939 if (unicode_decode_call_errorhandler(
1940 errors, &errorHandler,
1941 "unicodeescape", "illegal Unicode character",
1942 starts, size, &startinpos, &endinpos, &exc, &s,
1943 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001944 goto onError;
1945 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001946 break;
1947
1948 /* \N{name} */
1949 case 'N':
1950 message = "malformed \\N character escape";
1951 if (ucnhash_CAPI == NULL) {
1952 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001953 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001954 m = PyImport_ImportModule("unicodedata");
1955 if (m == NULL)
1956 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001957 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001958 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001959 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001960 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001961 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001962 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 if (ucnhash_CAPI == NULL)
1964 goto ucnhashError;
1965 }
1966 if (*s == '{') {
1967 const char *start = s+1;
1968 /* look for the closing brace */
1969 while (*s != '}' && s < end)
1970 s++;
1971 if (s > start && s < end && *s == '}') {
1972 /* found a name. look it up in the unicode database */
1973 message = "unknown Unicode character name";
1974 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001975 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001976 goto store;
1977 }
1978 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001979 endinpos = s-starts;
1980 outpos = p-PyUnicode_AS_UNICODE(v);
1981 if (unicode_decode_call_errorhandler(
1982 errors, &errorHandler,
1983 "unicodeescape", message,
1984 starts, size, &startinpos, &endinpos, &exc, &s,
1985 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001986 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001987 break;
1988
1989 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001990 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001991 message = "\\ at end of string";
1992 s--;
1993 endinpos = s-starts;
1994 outpos = p-PyUnicode_AS_UNICODE(v);
1995 if (unicode_decode_call_errorhandler(
1996 errors, &errorHandler,
1997 "unicodeescape", message,
1998 starts, size, &startinpos, &endinpos, &exc, &s,
1999 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002000 goto onError;
2001 }
2002 else {
2003 *p++ = '\\';
2004 *p++ = (unsigned char)s[-1];
2005 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002006 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 nextByte:
2009 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002011 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002012 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002013 Py_XDECREF(errorHandler);
2014 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002016
Fredrik Lundhccc74732001-02-18 22:13:49 +00002017ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002018 PyErr_SetString(
2019 PyExc_UnicodeError,
2020 "\\N escapes not supported (can't load unicodedata module)"
2021 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002022 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002023 Py_XDECREF(errorHandler);
2024 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002025 return NULL;
2026
Fredrik Lundhccc74732001-02-18 22:13:49 +00002027onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 Py_XDECREF(errorHandler);
2030 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 return NULL;
2032}
2033
2034/* Return a Unicode-Escape string version of the Unicode object.
2035
2036 If quotes is true, the string is enclosed in u"" or u'' quotes as
2037 appropriate.
2038
2039*/
2040
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002041Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002042 Py_ssize_t size,
2043 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002044{
2045 /* like wcschr, but doesn't stop at NULL characters */
2046
2047 while (size-- > 0) {
2048 if (*s == ch)
2049 return s;
2050 s++;
2051 }
2052
2053 return NULL;
2054}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002055
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056static
2057PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002058 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 int quotes)
2060{
2061 PyObject *repr;
2062 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002064 static const char *hexdigit = "0123456789abcdef";
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002065#ifdef Py_UNICODE_WIDE
2066 const Py_ssize_t expandsize = 10;
2067#else
2068 const Py_ssize_t expandsize = 6;
2069#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002071 /* Initial allocation is based on the longest-possible unichr
2072 escape.
2073
2074 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2075 unichr, so in this case it's the longest unichr escape. In
2076 narrow (UTF-16) builds this is five chars per source unichr
2077 since there are two unichrs in the surrogate pair, so in narrow
2078 (UTF-16) builds it's not the longest unichr escape.
2079
2080 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2081 so in the narrow (UTF-16) build case it's the longest unichr
2082 escape.
2083 */
2084
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002085 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2086 return PyErr_NoMemory();
2087
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002088 repr = PyString_FromStringAndSize(NULL,
2089 2
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002090 + expandsize*size
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002091 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 if (repr == NULL)
2093 return NULL;
2094
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002095 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
2097 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002099 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 !findchar(s, size, '"')) ? '"' : '\'';
2101 }
2102 while (size-- > 0) {
2103 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002104
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002105 /* Escape quotes and backslashes */
2106 if ((quotes &&
2107 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 *p++ = '\\';
2109 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002110 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002112
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002113#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002114 /* Map 21-bit characters to '\U00xxxxxx' */
2115 else if (ch >= 0x10000) {
2116 *p++ = '\\';
2117 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002118 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2119 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2120 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2121 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2122 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2123 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2124 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002125 *p++ = hexdigit[ch & 0x0000000F];
2126 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002127 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002128#else
2129 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002130 else if (ch >= 0xD800 && ch < 0xDC00) {
2131 Py_UNICODE ch2;
2132 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002133
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002134 ch2 = *s++;
2135 size--;
2136 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2137 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2138 *p++ = '\\';
2139 *p++ = 'U';
2140 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2141 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2142 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2143 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2144 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2145 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2146 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2147 *p++ = hexdigit[ucs & 0x0000000F];
2148 continue;
2149 }
2150 /* Fall through: isolated surrogates are copied as-is */
2151 s--;
2152 size++;
2153 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002154#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002155
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002157 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 *p++ = '\\';
2159 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002160 *p++ = hexdigit[(ch >> 12) & 0x000F];
2161 *p++ = hexdigit[(ch >> 8) & 0x000F];
2162 *p++ = hexdigit[(ch >> 4) & 0x000F];
2163 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002165
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002166 /* Map special whitespace to '\t', \n', '\r' */
2167 else if (ch == '\t') {
2168 *p++ = '\\';
2169 *p++ = 't';
2170 }
2171 else if (ch == '\n') {
2172 *p++ = '\\';
2173 *p++ = 'n';
2174 }
2175 else if (ch == '\r') {
2176 *p++ = '\\';
2177 *p++ = 'r';
2178 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002179
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002180 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002181 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002183 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002184 *p++ = hexdigit[(ch >> 4) & 0x000F];
2185 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002186 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002187
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 /* Copy everything else as-is */
2189 else
2190 *p++ = (char) ch;
2191 }
2192 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002193 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194
2195 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002196 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 return repr;
2198}
2199
2200PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002201 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202{
2203 return unicodeescape_string(s, size, 0);
2204}
2205
2206PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2207{
2208 if (!PyUnicode_Check(unicode)) {
2209 PyErr_BadArgument();
2210 return NULL;
2211 }
2212 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2213 PyUnicode_GET_SIZE(unicode));
2214}
2215
2216/* --- Raw Unicode Escape Codec ------------------------------------------- */
2217
2218PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002219 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 const char *errors)
2221{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002222 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002223 Py_ssize_t startinpos;
2224 Py_ssize_t endinpos;
2225 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002227 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 const char *end;
2229 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002230 PyObject *errorHandler = NULL;
2231 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002232
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 /* Escaped strings will always be longer than the resulting
2234 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 length after conversion to the true value. (But decoding error
2236 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 v = _PyUnicode_New(size);
2238 if (v == NULL)
2239 goto onError;
2240 if (size == 0)
2241 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 end = s + size;
2244 while (s < end) {
2245 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002246 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002248 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249
2250 /* Non-escape characters are interpreted as Unicode ordinals */
2251 if (*s != '\\') {
2252 *p++ = (unsigned char)*s++;
2253 continue;
2254 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256
2257 /* \u-escapes are only interpreted iff the number of leading
2258 backslashes if odd */
2259 bs = s;
2260 for (;s < end;) {
2261 if (*s != '\\')
2262 break;
2263 *p++ = (unsigned char)*s++;
2264 }
2265 if (((s - bs) & 1) == 0 ||
2266 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002267 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 continue;
2269 }
2270 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002271 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 s++;
2273
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002274 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002275 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002276 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002279 endinpos = s-starts;
2280 if (unicode_decode_call_errorhandler(
2281 errors, &errorHandler,
2282 "rawunicodeescape", "truncated \\uXXXX",
2283 starts, size, &startinpos, &endinpos, &exc, &s,
2284 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002286 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287 }
2288 x = (x<<4) & ~0xF;
2289 if (c >= '0' && c <= '9')
2290 x += c - '0';
2291 else if (c >= 'a' && c <= 'f')
2292 x += 10 + c - 'a';
2293 else
2294 x += 10 + c - 'A';
2295 }
Amaury Forgeot d'Arca79e0502008-03-24 21:16:28 +00002296 if (x <= 0xffff)
2297 /* UCS-2 character */
2298 *p++ = (Py_UNICODE) x;
2299 else if (x <= 0x10ffff) {
2300 /* UCS-4 character. Either store directly, or as
2301 surrogate pair. */
2302#ifdef Py_UNICODE_WIDE
2303 *p++ = (Py_UNICODE) x;
2304#else
2305 x -= 0x10000L;
2306 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
2307 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
2308#endif
2309 } else {
2310 endinpos = s-starts;
2311 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002312 if (unicode_decode_call_errorhandler(
2313 errors, &errorHandler,
2314 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2315 starts, size, &startinpos, &endinpos, &exc, &s,
2316 (PyObject **)&v, &outpos, &p))
2317 goto onError;
2318 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002319 nextByte:
2320 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002322 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002323 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002324 Py_XDECREF(errorHandler);
2325 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002327
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328 onError:
2329 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002330 Py_XDECREF(errorHandler);
2331 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002332 return NULL;
2333}
2334
2335PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002336 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337{
2338 PyObject *repr;
2339 char *p;
2340 char *q;
2341
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002342 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002343#ifdef Py_UNICODE_WIDE
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002344 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002345#else
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002346 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002347#endif
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002348
2349 if (size > PY_SSIZE_T_MAX / expandsize)
2350 return PyErr_NoMemory();
2351
2352 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 if (repr == NULL)
2354 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002355 if (size == 0)
2356 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357
2358 p = q = PyString_AS_STRING(repr);
2359 while (size-- > 0) {
2360 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002361#ifdef Py_UNICODE_WIDE
2362 /* Map 32-bit characters to '\Uxxxxxxxx' */
2363 if (ch >= 0x10000) {
2364 *p++ = '\\';
2365 *p++ = 'U';
2366 *p++ = hexdigit[(ch >> 28) & 0xf];
2367 *p++ = hexdigit[(ch >> 24) & 0xf];
2368 *p++ = hexdigit[(ch >> 20) & 0xf];
2369 *p++ = hexdigit[(ch >> 16) & 0xf];
2370 *p++ = hexdigit[(ch >> 12) & 0xf];
2371 *p++ = hexdigit[(ch >> 8) & 0xf];
2372 *p++ = hexdigit[(ch >> 4) & 0xf];
2373 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002374 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002375 else
Amaury Forgeot d'Arca79e0502008-03-24 21:16:28 +00002376#else
2377 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2378 if (ch >= 0xD800 && ch < 0xDC00) {
2379 Py_UNICODE ch2;
2380 Py_UCS4 ucs;
2381
2382 ch2 = *s++;
2383 size--;
2384 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2385 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2386 *p++ = '\\';
2387 *p++ = 'U';
2388 *p++ = hexdigit[(ucs >> 28) & 0xf];
2389 *p++ = hexdigit[(ucs >> 24) & 0xf];
2390 *p++ = hexdigit[(ucs >> 20) & 0xf];
2391 *p++ = hexdigit[(ucs >> 16) & 0xf];
2392 *p++ = hexdigit[(ucs >> 12) & 0xf];
2393 *p++ = hexdigit[(ucs >> 8) & 0xf];
2394 *p++ = hexdigit[(ucs >> 4) & 0xf];
2395 *p++ = hexdigit[ucs & 0xf];
2396 continue;
2397 }
2398 /* Fall through: isolated surrogates are copied as-is */
2399 s--;
2400 size++;
2401 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002402#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 /* Map 16-bit characters to '\uxxxx' */
2404 if (ch >= 256) {
2405 *p++ = '\\';
2406 *p++ = 'u';
2407 *p++ = hexdigit[(ch >> 12) & 0xf];
2408 *p++ = hexdigit[(ch >> 8) & 0xf];
2409 *p++ = hexdigit[(ch >> 4) & 0xf];
2410 *p++ = hexdigit[ch & 15];
2411 }
2412 /* Copy everything else as-is */
2413 else
2414 *p++ = (char) ch;
2415 }
2416 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002417 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 return repr;
2419}
2420
2421PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2422{
2423 if (!PyUnicode_Check(unicode)) {
2424 PyErr_BadArgument();
2425 return NULL;
2426 }
2427 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2428 PyUnicode_GET_SIZE(unicode));
2429}
2430
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002431/* --- Unicode Internal Codec ------------------------------------------- */
2432
2433PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002434 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002435 const char *errors)
2436{
2437 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002438 Py_ssize_t startinpos;
2439 Py_ssize_t endinpos;
2440 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002441 PyUnicodeObject *v;
2442 Py_UNICODE *p;
2443 const char *end;
2444 const char *reason;
2445 PyObject *errorHandler = NULL;
2446 PyObject *exc = NULL;
2447
Neal Norwitzd43069c2006-01-08 01:12:10 +00002448#ifdef Py_UNICODE_WIDE
2449 Py_UNICODE unimax = PyUnicode_GetMax();
2450#endif
2451
Armin Rigo4b63c212006-10-04 11:44:06 +00002452 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002453 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2454 if (v == NULL)
2455 goto onError;
2456 if (PyUnicode_GetSize((PyObject *)v) == 0)
2457 return (PyObject *)v;
2458 p = PyUnicode_AS_UNICODE(v);
2459 end = s + size;
2460
2461 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002462 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002463 /* We have to sanity check the raw data, otherwise doom looms for
2464 some malformed UCS-4 data. */
2465 if (
2466 #ifdef Py_UNICODE_WIDE
2467 *p > unimax || *p < 0 ||
2468 #endif
2469 end-s < Py_UNICODE_SIZE
2470 )
2471 {
2472 startinpos = s - starts;
2473 if (end-s < Py_UNICODE_SIZE) {
2474 endinpos = end-starts;
2475 reason = "truncated input";
2476 }
2477 else {
2478 endinpos = s - starts + Py_UNICODE_SIZE;
2479 reason = "illegal code point (> 0x10FFFF)";
2480 }
2481 outpos = p - PyUnicode_AS_UNICODE(v);
2482 if (unicode_decode_call_errorhandler(
2483 errors, &errorHandler,
2484 "unicode_internal", reason,
2485 starts, size, &startinpos, &endinpos, &exc, &s,
2486 (PyObject **)&v, &outpos, &p)) {
2487 goto onError;
2488 }
2489 }
2490 else {
2491 p++;
2492 s += Py_UNICODE_SIZE;
2493 }
2494 }
2495
Martin v. Löwis412fb672006-04-13 06:34:32 +00002496 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002497 goto onError;
2498 Py_XDECREF(errorHandler);
2499 Py_XDECREF(exc);
2500 return (PyObject *)v;
2501
2502 onError:
2503 Py_XDECREF(v);
2504 Py_XDECREF(errorHandler);
2505 Py_XDECREF(exc);
2506 return NULL;
2507}
2508
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509/* --- Latin-1 Codec ------------------------------------------------------ */
2510
2511PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002512 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 const char *errors)
2514{
2515 PyUnicodeObject *v;
2516 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002517
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002519 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002520 Py_UNICODE r = *(unsigned char*)s;
2521 return PyUnicode_FromUnicode(&r, 1);
2522 }
2523
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524 v = _PyUnicode_New(size);
2525 if (v == NULL)
2526 goto onError;
2527 if (size == 0)
2528 return (PyObject *)v;
2529 p = PyUnicode_AS_UNICODE(v);
2530 while (size-- > 0)
2531 *p++ = (unsigned char)*s++;
2532 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002533
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 onError:
2535 Py_XDECREF(v);
2536 return NULL;
2537}
2538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539/* create or adjust a UnicodeEncodeError */
2540static void make_encode_exception(PyObject **exceptionObject,
2541 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002542 const Py_UNICODE *unicode, Py_ssize_t size,
2543 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002546 if (*exceptionObject == NULL) {
2547 *exceptionObject = PyUnicodeEncodeError_Create(
2548 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 }
2550 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002551 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2552 goto onError;
2553 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2554 goto onError;
2555 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2556 goto onError;
2557 return;
2558 onError:
2559 Py_DECREF(*exceptionObject);
2560 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 }
2562}
2563
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564/* raises a UnicodeEncodeError */
2565static void raise_encode_exception(PyObject **exceptionObject,
2566 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002567 const Py_UNICODE *unicode, Py_ssize_t size,
2568 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002569 const char *reason)
2570{
2571 make_encode_exception(exceptionObject,
2572 encoding, unicode, size, startpos, endpos, reason);
2573 if (*exceptionObject != NULL)
2574 PyCodec_StrictErrors(*exceptionObject);
2575}
2576
2577/* error handling callback helper:
2578 build arguments, call the callback and check the arguments,
2579 put the result into newpos and return the replacement string, which
2580 has to be freed by the caller */
2581static PyObject *unicode_encode_call_errorhandler(const char *errors,
2582 PyObject **errorHandler,
2583 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002584 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2585 Py_ssize_t startpos, Py_ssize_t endpos,
2586 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002588 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589
2590 PyObject *restuple;
2591 PyObject *resunicode;
2592
2593 if (*errorHandler == NULL) {
2594 *errorHandler = PyCodec_LookupError(errors);
2595 if (*errorHandler == NULL)
2596 return NULL;
2597 }
2598
2599 make_encode_exception(exceptionObject,
2600 encoding, unicode, size, startpos, endpos, reason);
2601 if (*exceptionObject == NULL)
2602 return NULL;
2603
2604 restuple = PyObject_CallFunctionObjArgs(
2605 *errorHandler, *exceptionObject, NULL);
2606 if (restuple == NULL)
2607 return NULL;
2608 if (!PyTuple_Check(restuple)) {
2609 PyErr_Format(PyExc_TypeError, &argparse[4]);
2610 Py_DECREF(restuple);
2611 return NULL;
2612 }
2613 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2614 &resunicode, newpos)) {
2615 Py_DECREF(restuple);
2616 return NULL;
2617 }
2618 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002619 *newpos = size+*newpos;
2620 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002621 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002622 Py_DECREF(restuple);
2623 return NULL;
2624 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 Py_INCREF(resunicode);
2626 Py_DECREF(restuple);
2627 return resunicode;
2628}
2629
2630static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002631 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002632 const char *errors,
2633 int limit)
2634{
2635 /* output object */
2636 PyObject *res;
2637 /* pointers to the beginning and end+1 of input */
2638 const Py_UNICODE *startp = p;
2639 const Py_UNICODE *endp = p + size;
2640 /* pointer to the beginning of the unencodable characters */
2641 /* const Py_UNICODE *badp = NULL; */
2642 /* pointer into the output */
2643 char *str;
2644 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002645 Py_ssize_t respos = 0;
2646 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002647 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2648 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 PyObject *errorHandler = NULL;
2650 PyObject *exc = NULL;
2651 /* the following variable is used for caching string comparisons
2652 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2653 int known_errorHandler = -1;
2654
2655 /* allocate enough for a simple encoding without
2656 replacements, if we need more, we'll resize */
2657 res = PyString_FromStringAndSize(NULL, size);
2658 if (res == NULL)
2659 goto onError;
2660 if (size == 0)
2661 return res;
2662 str = PyString_AS_STRING(res);
2663 ressize = size;
2664
2665 while (p<endp) {
2666 Py_UNICODE c = *p;
2667
2668 /* can we encode this? */
2669 if (c<limit) {
2670 /* no overflow check, because we know that the space is enough */
2671 *str++ = (char)c;
2672 ++p;
2673 }
2674 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002675 Py_ssize_t unicodepos = p-startp;
2676 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002677 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002678 Py_ssize_t repsize;
2679 Py_ssize_t newpos;
2680 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002681 Py_UNICODE *uni2;
2682 /* startpos for collecting unencodable chars */
2683 const Py_UNICODE *collstart = p;
2684 const Py_UNICODE *collend = p;
2685 /* find all unecodable characters */
2686 while ((collend < endp) && ((*collend)>=limit))
2687 ++collend;
2688 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2689 if (known_errorHandler==-1) {
2690 if ((errors==NULL) || (!strcmp(errors, "strict")))
2691 known_errorHandler = 1;
2692 else if (!strcmp(errors, "replace"))
2693 known_errorHandler = 2;
2694 else if (!strcmp(errors, "ignore"))
2695 known_errorHandler = 3;
2696 else if (!strcmp(errors, "xmlcharrefreplace"))
2697 known_errorHandler = 4;
2698 else
2699 known_errorHandler = 0;
2700 }
2701 switch (known_errorHandler) {
2702 case 1: /* strict */
2703 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2704 goto onError;
2705 case 2: /* replace */
2706 while (collstart++<collend)
2707 *str++ = '?'; /* fall through */
2708 case 3: /* ignore */
2709 p = collend;
2710 break;
2711 case 4: /* xmlcharrefreplace */
2712 respos = str-PyString_AS_STRING(res);
2713 /* determine replacement size (temporarily (mis)uses p) */
2714 for (p = collstart, repsize = 0; p < collend; ++p) {
2715 if (*p<10)
2716 repsize += 2+1+1;
2717 else if (*p<100)
2718 repsize += 2+2+1;
2719 else if (*p<1000)
2720 repsize += 2+3+1;
2721 else if (*p<10000)
2722 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002723#ifndef Py_UNICODE_WIDE
2724 else
2725 repsize += 2+5+1;
2726#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 else if (*p<100000)
2728 repsize += 2+5+1;
2729 else if (*p<1000000)
2730 repsize += 2+6+1;
2731 else
2732 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002733#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 }
2735 requiredsize = respos+repsize+(endp-collend);
2736 if (requiredsize > ressize) {
2737 if (requiredsize<2*ressize)
2738 requiredsize = 2*ressize;
2739 if (_PyString_Resize(&res, requiredsize))
2740 goto onError;
2741 str = PyString_AS_STRING(res) + respos;
2742 ressize = requiredsize;
2743 }
2744 /* generate replacement (temporarily (mis)uses p) */
2745 for (p = collstart; p < collend; ++p) {
2746 str += sprintf(str, "&#%d;", (int)*p);
2747 }
2748 p = collend;
2749 break;
2750 default:
2751 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2752 encoding, reason, startp, size, &exc,
2753 collstart-startp, collend-startp, &newpos);
2754 if (repunicode == NULL)
2755 goto onError;
2756 /* need more space? (at least enough for what we
2757 have+the replacement+the rest of the string, so
2758 we won't have to check space for encodable characters) */
2759 respos = str-PyString_AS_STRING(res);
2760 repsize = PyUnicode_GET_SIZE(repunicode);
2761 requiredsize = respos+repsize+(endp-collend);
2762 if (requiredsize > ressize) {
2763 if (requiredsize<2*ressize)
2764 requiredsize = 2*ressize;
2765 if (_PyString_Resize(&res, requiredsize)) {
2766 Py_DECREF(repunicode);
2767 goto onError;
2768 }
2769 str = PyString_AS_STRING(res) + respos;
2770 ressize = requiredsize;
2771 }
2772 /* check if there is anything unencodable in the replacement
2773 and copy it to the output */
2774 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2775 c = *uni2;
2776 if (c >= limit) {
2777 raise_encode_exception(&exc, encoding, startp, size,
2778 unicodepos, unicodepos+1, reason);
2779 Py_DECREF(repunicode);
2780 goto onError;
2781 }
2782 *str = (char)c;
2783 }
2784 p = startp + newpos;
2785 Py_DECREF(repunicode);
2786 }
2787 }
2788 }
2789 /* Resize if we allocated to much */
2790 respos = str-PyString_AS_STRING(res);
2791 if (respos<ressize)
2792 /* If this falls res will be NULL */
2793 _PyString_Resize(&res, respos);
2794 Py_XDECREF(errorHandler);
2795 Py_XDECREF(exc);
2796 return res;
2797
2798 onError:
2799 Py_XDECREF(res);
2800 Py_XDECREF(errorHandler);
2801 Py_XDECREF(exc);
2802 return NULL;
2803}
2804
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002806 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 const char *errors)
2808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810}
2811
2812PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2813{
2814 if (!PyUnicode_Check(unicode)) {
2815 PyErr_BadArgument();
2816 return NULL;
2817 }
2818 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2819 PyUnicode_GET_SIZE(unicode),
2820 NULL);
2821}
2822
2823/* --- 7-bit ASCII Codec -------------------------------------------------- */
2824
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002826 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 const char *errors)
2828{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 PyUnicodeObject *v;
2831 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002832 Py_ssize_t startinpos;
2833 Py_ssize_t endinpos;
2834 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 const char *e;
2836 PyObject *errorHandler = NULL;
2837 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002838
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002840 if (size == 1 && *(unsigned char*)s < 128) {
2841 Py_UNICODE r = *(unsigned char*)s;
2842 return PyUnicode_FromUnicode(&r, 1);
2843 }
Tim Petersced69f82003-09-16 20:30:58 +00002844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 v = _PyUnicode_New(size);
2846 if (v == NULL)
2847 goto onError;
2848 if (size == 0)
2849 return (PyObject *)v;
2850 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 e = s + size;
2852 while (s < e) {
2853 register unsigned char c = (unsigned char)*s;
2854 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 ++s;
2857 }
2858 else {
2859 startinpos = s-starts;
2860 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002861 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862 if (unicode_decode_call_errorhandler(
2863 errors, &errorHandler,
2864 "ascii", "ordinal not in range(128)",
2865 starts, size, &startinpos, &endinpos, &exc, &s,
2866 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002870 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002871 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002872 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873 Py_XDECREF(errorHandler);
2874 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002876
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 onError:
2878 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 Py_XDECREF(errorHandler);
2880 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881 return NULL;
2882}
2883
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002885 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 const char *errors)
2887{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002888 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889}
2890
2891PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2892{
2893 if (!PyUnicode_Check(unicode)) {
2894 PyErr_BadArgument();
2895 return NULL;
2896 }
2897 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2898 PyUnicode_GET_SIZE(unicode),
2899 NULL);
2900}
2901
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002902#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002903
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002904/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002905
/* The MBCS helpers below take their length as an int.  When Py_ssize_t
   is wider than int, the public entry points must feed them the input in
   chunks of at most INT_MAX; NEED_RETRY enables that chunking code. */
#if SIZEOF_SSIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
2909
2910/* XXX This code is limited to "true" double-byte encodings, as
2911 a) it assumes an incomplete character consists of a single byte, and
2912 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2913 encodings, see IsDBCSLeadByteEx documentation. */
2914
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.  The byte must satisfy IsDBCSLeadByte(); it then counts
   as a lead byte if CharPrev() cannot step back from it, if the
   character before it does not begin with a lead byte, or if that
   preceding character is two bytes long (presumably meaning the byte
   under test begins a new character -- confirm against the CharPrev
   documentation). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2925
2926/*
2927 * Decode MBCS string into unicode object. If 'final' is set, converts
2928 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2929 */
2930static int decode_mbcs(PyUnicodeObject **v,
2931 const char *s, /* MBCS string */
2932 int size, /* sizeof MBCS string */
2933 int final)
2934{
2935 Py_UNICODE *p;
2936 Py_ssize_t n = 0;
2937 int usize = 0;
2938
2939 assert(size >= 0);
2940
2941 /* Skip trailing lead-byte unless 'final' is set */
2942 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2943 --size;
2944
2945 /* First get the size of the result */
2946 if (size > 0) {
2947 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2948 if (usize == 0) {
2949 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2950 return -1;
2951 }
2952 }
2953
2954 if (*v == NULL) {
2955 /* Create unicode object */
2956 *v = _PyUnicode_New(usize);
2957 if (*v == NULL)
2958 return -1;
2959 }
2960 else {
2961 /* Extend unicode object */
2962 n = PyUnicode_GET_SIZE(*v);
2963 if (_PyUnicode_Resize(v, n + usize) < 0)
2964 return -1;
2965 }
2966
2967 /* Do the conversion */
2968 if (size > 0) {
2969 p = PyUnicode_AS_UNICODE(*v) + n;
2970 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2971 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2972 return -1;
2973 }
2974 }
2975
2976 return size;
2977}
2978
2979PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2980 Py_ssize_t size,
2981 const char *errors,
2982 Py_ssize_t *consumed)
2983{
2984 PyUnicodeObject *v = NULL;
2985 int done;
2986
2987 if (consumed)
2988 *consumed = 0;
2989
2990#ifdef NEED_RETRY
2991 retry:
2992 if (size > INT_MAX)
2993 done = decode_mbcs(&v, s, INT_MAX, 0);
2994 else
2995#endif
2996 done = decode_mbcs(&v, s, (int)size, !consumed);
2997
2998 if (done < 0) {
2999 Py_XDECREF(v);
3000 return NULL;
3001 }
3002
3003 if (consumed)
3004 *consumed += done;
3005
3006#ifdef NEED_RETRY
3007 if (size > INT_MAX) {
3008 s += done;
3009 size -= done;
3010 goto retry;
3011 }
3012#endif
3013
3014 return (PyObject *)v;
3015}
3016
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003017PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003019 const char *errors)
3020{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003021 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3022}
3023
3024/*
3025 * Convert unicode into string object (MBCS).
3026 * Returns 0 if succeed, -1 otherwise.
3027 */
3028static int encode_mbcs(PyObject **repr,
3029 const Py_UNICODE *p, /* unicode */
3030 int size) /* size of unicode */
3031{
3032 int mbcssize = 0;
3033 Py_ssize_t n = 0;
3034
3035 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003036
3037 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003038 if (size > 0) {
3039 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3040 if (mbcssize == 0) {
3041 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3042 return -1;
3043 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003044 }
3045
Martin v. Löwisd8251432006-06-14 05:21:04 +00003046 if (*repr == NULL) {
3047 /* Create string object */
3048 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3049 if (*repr == NULL)
3050 return -1;
3051 }
3052 else {
3053 /* Extend string object */
3054 n = PyString_Size(*repr);
3055 if (_PyString_Resize(repr, n + mbcssize) < 0)
3056 return -1;
3057 }
3058
3059 /* Do the conversion */
3060 if (size > 0) {
3061 char *s = PyString_AS_STRING(*repr) + n;
3062 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3063 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3064 return -1;
3065 }
3066 }
3067
3068 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003069}
3070
3071PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003072 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003073 const char *errors)
3074{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003075 PyObject *repr = NULL;
3076 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003077
Martin v. Löwisd8251432006-06-14 05:21:04 +00003078#ifdef NEED_RETRY
3079 retry:
3080 if (size > INT_MAX)
3081 ret = encode_mbcs(&repr, p, INT_MAX);
3082 else
3083#endif
3084 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003085
Martin v. Löwisd8251432006-06-14 05:21:04 +00003086 if (ret < 0) {
3087 Py_XDECREF(repr);
3088 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003089 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003090
3091#ifdef NEED_RETRY
3092 if (size > INT_MAX) {
3093 p += INT_MAX;
3094 size -= INT_MAX;
3095 goto retry;
3096 }
3097#endif
3098
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003099 return repr;
3100}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003101
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003102PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3103{
3104 if (!PyUnicode_Check(unicode)) {
3105 PyErr_BadArgument();
3106 return NULL;
3107 }
3108 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3109 PyUnicode_GET_SIZE(unicode),
3110 NULL);
3111}
3112
Martin v. Löwisd8251432006-06-14 05:21:04 +00003113#undef NEED_RETRY
3114
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003115#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003116
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117/* --- Character Mapping Codec -------------------------------------------- */
3118
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003120 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 PyObject *mapping,
3122 const char *errors)
3123{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003124 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003125 Py_ssize_t startinpos;
3126 Py_ssize_t endinpos;
3127 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003128 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 PyUnicodeObject *v;
3130 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003131 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 PyObject *errorHandler = NULL;
3133 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003134 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003135 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003136
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 /* Default to Latin-1 */
3138 if (mapping == NULL)
3139 return PyUnicode_DecodeLatin1(s, size, errors);
3140
3141 v = _PyUnicode_New(size);
3142 if (v == NULL)
3143 goto onError;
3144 if (size == 0)
3145 return (PyObject *)v;
3146 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003147 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003148 if (PyUnicode_CheckExact(mapping)) {
3149 mapstring = PyUnicode_AS_UNICODE(mapping);
3150 maplen = PyUnicode_GET_SIZE(mapping);
3151 while (s < e) {
3152 unsigned char ch = *s;
3153 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003155 if (ch < maplen)
3156 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003158 if (x == 0xfffe) {
3159 /* undefined mapping */
3160 outpos = p-PyUnicode_AS_UNICODE(v);
3161 startinpos = s-starts;
3162 endinpos = startinpos+1;
3163 if (unicode_decode_call_errorhandler(
3164 errors, &errorHandler,
3165 "charmap", "character maps to <undefined>",
3166 starts, size, &startinpos, &endinpos, &exc, &s,
3167 (PyObject **)&v, &outpos, &p)) {
3168 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003169 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003170 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003171 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003172 *p++ = x;
3173 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003175 }
3176 else {
3177 while (s < e) {
3178 unsigned char ch = *s;
3179 PyObject *w, *x;
3180
3181 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3182 w = PyInt_FromLong((long)ch);
3183 if (w == NULL)
3184 goto onError;
3185 x = PyObject_GetItem(mapping, w);
3186 Py_DECREF(w);
3187 if (x == NULL) {
3188 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3189 /* No mapping found means: mapping is undefined. */
3190 PyErr_Clear();
3191 x = Py_None;
3192 Py_INCREF(x);
3193 } else
3194 goto onError;
3195 }
3196
3197 /* Apply mapping */
3198 if (PyInt_Check(x)) {
3199 long value = PyInt_AS_LONG(x);
3200 if (value < 0 || value > 65535) {
3201 PyErr_SetString(PyExc_TypeError,
3202 "character mapping must be in range(65536)");
3203 Py_DECREF(x);
3204 goto onError;
3205 }
3206 *p++ = (Py_UNICODE)value;
3207 }
3208 else if (x == Py_None) {
3209 /* undefined mapping */
3210 outpos = p-PyUnicode_AS_UNICODE(v);
3211 startinpos = s-starts;
3212 endinpos = startinpos+1;
3213 if (unicode_decode_call_errorhandler(
3214 errors, &errorHandler,
3215 "charmap", "character maps to <undefined>",
3216 starts, size, &startinpos, &endinpos, &exc, &s,
3217 (PyObject **)&v, &outpos, &p)) {
3218 Py_DECREF(x);
3219 goto onError;
3220 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003221 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003222 continue;
3223 }
3224 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003225 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003226
3227 if (targetsize == 1)
3228 /* 1-1 mapping */
3229 *p++ = *PyUnicode_AS_UNICODE(x);
3230
3231 else if (targetsize > 1) {
3232 /* 1-n mapping */
3233 if (targetsize > extrachars) {
3234 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003235 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3236 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003237 (targetsize << 2);
3238 extrachars += needed;
Armin Rigo4b63c212006-10-04 11:44:06 +00003239 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003240 if (_PyUnicode_Resize(&v,
3241 PyUnicode_GET_SIZE(v) + needed) < 0) {
3242 Py_DECREF(x);
3243 goto onError;
3244 }
3245 p = PyUnicode_AS_UNICODE(v) + oldpos;
3246 }
3247 Py_UNICODE_COPY(p,
3248 PyUnicode_AS_UNICODE(x),
3249 targetsize);
3250 p += targetsize;
3251 extrachars -= targetsize;
3252 }
3253 /* 1-0 mapping: skip the character */
3254 }
3255 else {
3256 /* wrong return value */
3257 PyErr_SetString(PyExc_TypeError,
3258 "character mapping must return integer, None or unicode");
3259 Py_DECREF(x);
3260 goto onError;
3261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003263 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 }
3266 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003267 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 Py_XDECREF(errorHandler);
3270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003272
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 Py_XDECREF(errorHandler);
3275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 Py_XDECREF(v);
3277 return NULL;
3278}
3279
Martin v. Löwis3f767792006-06-04 19:36:28 +00003280/* Charmap encoding: the lookup table */
3281
3282struct encoding_map{
3283 PyObject_HEAD
3284 unsigned char level1[32];
3285 int count2, count3;
3286 unsigned char level23[1];
3287};
3288
3289static PyObject*
3290encoding_map_size(PyObject *obj, PyObject* args)
3291{
3292 struct encoding_map *map = (struct encoding_map*)obj;
3293 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3294 128*map->count3);
3295}
3296
3297static PyMethodDef encoding_map_methods[] = {
3298 {"size", encoding_map_size, METH_NOARGS,
3299 PyDoc_STR("Return the size (in bytes) of this object") },
3300 { 0 }
3301};
3302
3303static void
3304encoding_map_dealloc(PyObject* o)
3305{
3306 PyObject_FREE(o);
3307}
3308
3309static PyTypeObject EncodingMapType = {
3310 PyObject_HEAD_INIT(NULL)
3311 0, /*ob_size*/
3312 "EncodingMap", /*tp_name*/
3313 sizeof(struct encoding_map), /*tp_basicsize*/
3314 0, /*tp_itemsize*/
3315 /* methods */
3316 encoding_map_dealloc, /*tp_dealloc*/
3317 0, /*tp_print*/
3318 0, /*tp_getattr*/
3319 0, /*tp_setattr*/
3320 0, /*tp_compare*/
3321 0, /*tp_repr*/
3322 0, /*tp_as_number*/
3323 0, /*tp_as_sequence*/
3324 0, /*tp_as_mapping*/
3325 0, /*tp_hash*/
3326 0, /*tp_call*/
3327 0, /*tp_str*/
3328 0, /*tp_getattro*/
3329 0, /*tp_setattro*/
3330 0, /*tp_as_buffer*/
3331 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3332 0, /*tp_doc*/
3333 0, /*tp_traverse*/
3334 0, /*tp_clear*/
3335 0, /*tp_richcompare*/
3336 0, /*tp_weaklistoffset*/
3337 0, /*tp_iter*/
3338 0, /*tp_iternext*/
3339 encoding_map_methods, /*tp_methods*/
3340 0, /*tp_members*/
3341 0, /*tp_getset*/
3342 0, /*tp_base*/
3343 0, /*tp_dict*/
3344 0, /*tp_descr_get*/
3345 0, /*tp_descr_set*/
3346 0, /*tp_dictoffset*/
3347 0, /*tp_init*/
3348 0, /*tp_alloc*/
3349 0, /*tp_new*/
3350 0, /*tp_free*/
3351 0, /*tp_is_gc*/
3352};
3353
3354PyObject*
3355PyUnicode_BuildEncodingMap(PyObject* string)
3356{
3357 Py_UNICODE *decode;
3358 PyObject *result;
3359 struct encoding_map *mresult;
3360 int i;
3361 int need_dict = 0;
3362 unsigned char level1[32];
3363 unsigned char level2[512];
3364 unsigned char *mlevel1, *mlevel2, *mlevel3;
3365 int count2 = 0, count3 = 0;
3366
3367 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3368 PyErr_BadArgument();
3369 return NULL;
3370 }
3371 decode = PyUnicode_AS_UNICODE(string);
3372 memset(level1, 0xFF, sizeof level1);
3373 memset(level2, 0xFF, sizeof level2);
3374
3375 /* If there isn't a one-to-one mapping of NULL to \0,
3376 or if there are non-BMP characters, we need to use
3377 a mapping dictionary. */
3378 if (decode[0] != 0)
3379 need_dict = 1;
3380 for (i = 1; i < 256; i++) {
3381 int l1, l2;
3382 if (decode[i] == 0
3383 #ifdef Py_UNICODE_WIDE
3384 || decode[i] > 0xFFFF
3385 #endif
3386 ) {
3387 need_dict = 1;
3388 break;
3389 }
3390 if (decode[i] == 0xFFFE)
3391 /* unmapped character */
3392 continue;
3393 l1 = decode[i] >> 11;
3394 l2 = decode[i] >> 7;
3395 if (level1[l1] == 0xFF)
3396 level1[l1] = count2++;
3397 if (level2[l2] == 0xFF)
3398 level2[l2] = count3++;
3399 }
3400
3401 if (count2 >= 0xFF || count3 >= 0xFF)
3402 need_dict = 1;
3403
3404 if (need_dict) {
3405 PyObject *result = PyDict_New();
3406 PyObject *key, *value;
3407 if (!result)
3408 return NULL;
3409 for (i = 0; i < 256; i++) {
3410 key = value = NULL;
3411 key = PyInt_FromLong(decode[i]);
3412 value = PyInt_FromLong(i);
3413 if (!key || !value)
3414 goto failed1;
3415 if (PyDict_SetItem(result, key, value) == -1)
3416 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003417 Py_DECREF(key);
3418 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003419 }
3420 return result;
3421 failed1:
3422 Py_XDECREF(key);
3423 Py_XDECREF(value);
3424 Py_DECREF(result);
3425 return NULL;
3426 }
3427
3428 /* Create a three-level trie */
3429 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3430 16*count2 + 128*count3 - 1);
3431 if (!result)
3432 return PyErr_NoMemory();
3433 PyObject_Init(result, &EncodingMapType);
3434 mresult = (struct encoding_map*)result;
3435 mresult->count2 = count2;
3436 mresult->count3 = count3;
3437 mlevel1 = mresult->level1;
3438 mlevel2 = mresult->level23;
3439 mlevel3 = mresult->level23 + 16*count2;
3440 memcpy(mlevel1, level1, 32);
3441 memset(mlevel2, 0xFF, 16*count2);
3442 memset(mlevel3, 0, 128*count3);
3443 count3 = 0;
3444 for (i = 1; i < 256; i++) {
3445 int o1, o2, o3, i2, i3;
3446 if (decode[i] == 0xFFFE)
3447 /* unmapped character */
3448 continue;
3449 o1 = decode[i]>>11;
3450 o2 = (decode[i]>>7) & 0xF;
3451 i2 = 16*mlevel1[o1] + o2;
3452 if (mlevel2[i2] == 0xFF)
3453 mlevel2[i2] = count3++;
3454 o3 = decode[i] & 0x7F;
3455 i3 = 128*mlevel2[i2] + o3;
3456 mlevel3[i3] = i;
3457 }
3458 return result;
3459}
3460
3461static int
3462encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3463{
3464 struct encoding_map *map = (struct encoding_map*)mapping;
3465 int l1 = c>>11;
3466 int l2 = (c>>7) & 0xF;
3467 int l3 = c & 0x7F;
3468 int i;
3469
3470#ifdef Py_UNICODE_WIDE
3471 if (c > 0xFFFF) {
3472 return -1;
3473 }
3474#endif
3475 if (c == 0)
3476 return 0;
3477 /* level 1*/
3478 i = map->level1[l1];
3479 if (i == 0xFF) {
3480 return -1;
3481 }
3482 /* level 2*/
3483 i = map->level23[16*i+l2];
3484 if (i == 0xFF) {
3485 return -1;
3486 }
3487 /* level 3 */
3488 i = map->level23[16*map->count2 + 128*i + l3];
3489 if (i == 0) {
3490 return -1;
3491 }
3492 return i;
3493}
3494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495/* Lookup the character ch in the mapping. If the character
3496 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003497 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500 PyObject *w = PyInt_FromLong((long)c);
3501 PyObject *x;
3502
3503 if (w == NULL)
3504 return NULL;
3505 x = PyObject_GetItem(mapping, w);
3506 Py_DECREF(w);
3507 if (x == NULL) {
3508 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3509 /* No mapping found means: mapping is undefined. */
3510 PyErr_Clear();
3511 x = Py_None;
3512 Py_INCREF(x);
3513 return x;
3514 } else
3515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003517 else if (x == Py_None)
3518 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 else if (PyInt_Check(x)) {
3520 long value = PyInt_AS_LONG(x);
3521 if (value < 0 || value > 255) {
3522 PyErr_SetString(PyExc_TypeError,
3523 "character mapping must be in range(256)");
3524 Py_DECREF(x);
3525 return NULL;
3526 }
3527 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 else if (PyString_Check(x))
3530 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 /* wrong return value */
3533 PyErr_SetString(PyExc_TypeError,
3534 "character mapping must return integer, None or str");
3535 Py_DECREF(x);
3536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 }
3538}
3539
Martin v. Löwis3f767792006-06-04 19:36:28 +00003540static int
3541charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3542{
3543 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3544 /* exponentially overallocate to minimize reallocations */
3545 if (requiredsize < 2*outsize)
3546 requiredsize = 2*outsize;
3547 if (_PyString_Resize(outobj, requiredsize)) {
3548 return 0;
3549 }
3550 return 1;
3551}
3552
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556/* lookup the character, put the result in the output string and adjust
3557 various state variables. Reallocate the output string if not enough
3558 space is available. Return a new reference to the object that
3559 was put in the output buffer, or Py_None, if the mapping was undefined
3560 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003561 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003563charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003564 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003566 PyObject *rep;
3567 char *outstart;
3568 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569
Martin v. Löwis3f767792006-06-04 19:36:28 +00003570 if (mapping->ob_type == &EncodingMapType) {
3571 int res = encoding_map_lookup(c, mapping);
3572 Py_ssize_t requiredsize = *outpos+1;
3573 if (res == -1)
3574 return enc_FAILED;
3575 if (outsize<requiredsize)
3576 if (!charmapencode_resize(outobj, outpos, requiredsize))
3577 return enc_EXCEPTION;
3578 outstart = PyString_AS_STRING(*outobj);
3579 outstart[(*outpos)++] = (char)res;
3580 return enc_SUCCESS;
3581 }
3582
3583 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003585 return enc_EXCEPTION;
3586 else if (rep==Py_None) {
3587 Py_DECREF(rep);
3588 return enc_FAILED;
3589 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003591 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003592 if (outsize<requiredsize)
3593 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003595 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003597 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3599 }
3600 else {
3601 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003602 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3603 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003604 if (outsize<requiredsize)
3605 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003607 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003609 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 memcpy(outstart + *outpos, repchars, repsize);
3611 *outpos += repsize;
3612 }
3613 }
Georg Brandl9f167602006-06-04 21:46:16 +00003614 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003615 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616}
3617
3618/* handle an error in PyUnicode_EncodeCharmap
3619 Return 0 on success, -1 on error */
3620static
3621int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003622 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003624 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003625 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626{
3627 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003628 Py_ssize_t repsize;
3629 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630 Py_UNICODE *uni2;
3631 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003632 Py_ssize_t collstartpos = *inpos;
3633 Py_ssize_t collendpos = *inpos+1;
3634 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 char *encoding = "charmap";
3636 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003637 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 /* find all unencodable characters */
3640 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003641 PyObject *rep;
3642 if (mapping->ob_type == &EncodingMapType) {
3643 int res = encoding_map_lookup(p[collendpos], mapping);
3644 if (res != -1)
3645 break;
3646 ++collendpos;
3647 continue;
3648 }
3649
3650 rep = charmapencode_lookup(p[collendpos], mapping);
3651 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003653 else if (rep!=Py_None) {
3654 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 break;
3656 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003657 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 ++collendpos;
3659 }
3660 /* cache callback name lookup
3661 * (if not done yet, i.e. it's the first error) */
3662 if (*known_errorHandler==-1) {
3663 if ((errors==NULL) || (!strcmp(errors, "strict")))
3664 *known_errorHandler = 1;
3665 else if (!strcmp(errors, "replace"))
3666 *known_errorHandler = 2;
3667 else if (!strcmp(errors, "ignore"))
3668 *known_errorHandler = 3;
3669 else if (!strcmp(errors, "xmlcharrefreplace"))
3670 *known_errorHandler = 4;
3671 else
3672 *known_errorHandler = 0;
3673 }
3674 switch (*known_errorHandler) {
3675 case 1: /* strict */
3676 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3677 return -1;
3678 case 2: /* replace */
3679 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3680 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003681 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 return -1;
3683 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003684 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3686 return -1;
3687 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 }
3689 /* fall through */
3690 case 3: /* ignore */
3691 *inpos = collendpos;
3692 break;
3693 case 4: /* xmlcharrefreplace */
3694 /* generate replacement (temporarily (mis)uses p) */
3695 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3696 char buffer[2+29+1+1];
3697 char *cp;
3698 sprintf(buffer, "&#%d;", (int)p[collpos]);
3699 for (cp = buffer; *cp; ++cp) {
3700 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003701 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003703 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003704 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3705 return -1;
3706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 }
3708 }
3709 *inpos = collendpos;
3710 break;
3711 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003712 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 encoding, reason, p, size, exceptionObject,
3714 collstartpos, collendpos, &newpos);
3715 if (repunicode == NULL)
3716 return -1;
3717 /* generate replacement */
3718 repsize = PyUnicode_GET_SIZE(repunicode);
3719 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3720 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003721 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 return -1;
3723 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003724 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3727 return -1;
3728 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 }
3730 *inpos = newpos;
3731 Py_DECREF(repunicode);
3732 }
3733 return 0;
3734}
3735
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003737 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 PyObject *mapping,
3739 const char *errors)
3740{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 /* output object */
3742 PyObject *res = NULL;
3743 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003744 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003745 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003746 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 PyObject *errorHandler = NULL;
3748 PyObject *exc = NULL;
3749 /* the following variable is used for caching string comparisons
3750 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3751 * 3=ignore, 4=xmlcharrefreplace */
3752 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753
3754 /* Default to Latin-1 */
3755 if (mapping == NULL)
3756 return PyUnicode_EncodeLatin1(p, size, errors);
3757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 /* allocate enough for a simple encoding without
3759 replacements, if we need more, we'll resize */
3760 res = PyString_FromStringAndSize(NULL, size);
3761 if (res == NULL)
3762 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003763 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 while (inpos<size) {
3767 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003768 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3769 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003771 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 if (charmap_encoding_error(p, size, &inpos, mapping,
3773 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003774 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003775 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003776 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 else
3780 /* done with this character => adjust input position */
3781 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 /* Resize if we allocated to much */
3785 if (respos<PyString_GET_SIZE(res)) {
3786 if (_PyString_Resize(&res, respos))
3787 goto onError;
3788 }
3789 Py_XDECREF(exc);
3790 Py_XDECREF(errorHandler);
3791 return res;
3792
3793 onError:
3794 Py_XDECREF(res);
3795 Py_XDECREF(exc);
3796 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 return NULL;
3798}
3799
3800PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3801 PyObject *mapping)
3802{
3803 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3804 PyErr_BadArgument();
3805 return NULL;
3806 }
3807 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3808 PyUnicode_GET_SIZE(unicode),
3809 mapping,
3810 NULL);
3811}
3812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813/* create or adjust a UnicodeTranslateError */
3814static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003815 const Py_UNICODE *unicode, Py_ssize_t size,
3816 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003819 if (*exceptionObject == NULL) {
3820 *exceptionObject = PyUnicodeTranslateError_Create(
3821 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 }
3823 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3825 goto onError;
3826 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3827 goto onError;
3828 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3829 goto onError;
3830 return;
3831 onError:
3832 Py_DECREF(*exceptionObject);
3833 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 }
3835}
3836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003837/* raises a UnicodeTranslateError */
3838static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003839 const Py_UNICODE *unicode, Py_ssize_t size,
3840 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 const char *reason)
3842{
3843 make_translate_exception(exceptionObject,
3844 unicode, size, startpos, endpos, reason);
3845 if (*exceptionObject != NULL)
3846 PyCodec_StrictErrors(*exceptionObject);
3847}
3848
3849/* error handling callback helper:
3850 build arguments, call the callback and check the arguments,
3851 put the result into newpos and return the replacement string, which
3852 has to be freed by the caller */
3853static PyObject *unicode_translate_call_errorhandler(const char *errors,
3854 PyObject **errorHandler,
3855 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003856 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3857 Py_ssize_t startpos, Py_ssize_t endpos,
3858 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003859{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003860 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861
Martin v. Löwis412fb672006-04-13 06:34:32 +00003862 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003863 PyObject *restuple;
3864 PyObject *resunicode;
3865
3866 if (*errorHandler == NULL) {
3867 *errorHandler = PyCodec_LookupError(errors);
3868 if (*errorHandler == NULL)
3869 return NULL;
3870 }
3871
3872 make_translate_exception(exceptionObject,
3873 unicode, size, startpos, endpos, reason);
3874 if (*exceptionObject == NULL)
3875 return NULL;
3876
3877 restuple = PyObject_CallFunctionObjArgs(
3878 *errorHandler, *exceptionObject, NULL);
3879 if (restuple == NULL)
3880 return NULL;
3881 if (!PyTuple_Check(restuple)) {
3882 PyErr_Format(PyExc_TypeError, &argparse[4]);
3883 Py_DECREF(restuple);
3884 return NULL;
3885 }
3886 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003887 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 Py_DECREF(restuple);
3889 return NULL;
3890 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003891 if (i_newpos<0)
3892 *newpos = size+i_newpos;
3893 else
3894 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003895 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003896 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003897 Py_DECREF(restuple);
3898 return NULL;
3899 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 Py_INCREF(resunicode);
3901 Py_DECREF(restuple);
3902 return resunicode;
3903}
3904
3905/* Lookup the character ch in the mapping and put the result in result,
3906 which must be decrefed by the caller.
3907 Return 0 on success, -1 on error */
3908static
3909int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3910{
3911 PyObject *w = PyInt_FromLong((long)c);
3912 PyObject *x;
3913
3914 if (w == NULL)
3915 return -1;
3916 x = PyObject_GetItem(mapping, w);
3917 Py_DECREF(w);
3918 if (x == NULL) {
3919 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3920 /* No mapping found means: use 1:1 mapping. */
3921 PyErr_Clear();
3922 *result = NULL;
3923 return 0;
3924 } else
3925 return -1;
3926 }
3927 else if (x == Py_None) {
3928 *result = x;
3929 return 0;
3930 }
3931 else if (PyInt_Check(x)) {
3932 long value = PyInt_AS_LONG(x);
3933 long max = PyUnicode_GetMax();
3934 if (value < 0 || value > max) {
3935 PyErr_Format(PyExc_TypeError,
3936 "character mapping must be in range(0x%lx)", max+1);
3937 Py_DECREF(x);
3938 return -1;
3939 }
3940 *result = x;
3941 return 0;
3942 }
3943 else if (PyUnicode_Check(x)) {
3944 *result = x;
3945 return 0;
3946 }
3947 else {
3948 /* wrong return value */
3949 PyErr_SetString(PyExc_TypeError,
3950 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003951 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 return -1;
3953 }
3954}
3955/* ensure that *outobj is at least requiredsize characters long,
3956if not reallocate and adjust various state variables.
3957Return 0 on success, -1 on error */
3958static
Walter Dörwald4894c302003-10-24 14:25:28 +00003959int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003960 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003962 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003963 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003965 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003967 if (requiredsize < 2 * oldsize)
3968 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003969 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003970 return -1;
3971 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 }
3973 return 0;
3974}
3975/* lookup the character, put the result in the output string and adjust
3976 various state variables. Return a new reference to the object that
3977 was put in the output buffer in *result, or Py_None, if the mapping was
3978 undefined (in which case no character was written).
3979 The called must decref result.
3980 Return 0 on success, -1 on error. */
3981static
Walter Dörwald4894c302003-10-24 14:25:28 +00003982int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003983 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003984 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985{
Walter Dörwald4894c302003-10-24 14:25:28 +00003986 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 return -1;
3988 if (*res==NULL) {
3989 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003990 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 }
3992 else if (*res==Py_None)
3993 ;
3994 else if (PyInt_Check(*res)) {
3995 /* no overflow check, because we know that the space is enough */
3996 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3997 }
3998 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003999 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 if (repsize==1) {
4001 /* no overflow check, because we know that the space is enough */
4002 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4003 }
4004 else if (repsize!=0) {
4005 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004006 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004007 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004008 repsize - 1;
4009 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004010 return -1;
4011 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4012 *outp += repsize;
4013 }
4014 }
4015 else
4016 return -1;
4017 return 0;
4018}
4019
4020PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004021 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 PyObject *mapping,
4023 const char *errors)
4024{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 /* output object */
4026 PyObject *res = NULL;
4027 /* pointers to the beginning and end+1 of input */
4028 const Py_UNICODE *startp = p;
4029 const Py_UNICODE *endp = p + size;
4030 /* pointer into the output */
4031 Py_UNICODE *str;
4032 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004033 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 char *reason = "character maps to <undefined>";
4035 PyObject *errorHandler = NULL;
4036 PyObject *exc = NULL;
4037 /* the following variable is used for caching string comparisons
4038 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4039 * 3=ignore, 4=xmlcharrefreplace */
4040 int known_errorHandler = -1;
4041
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 if (mapping == NULL) {
4043 PyErr_BadArgument();
4044 return NULL;
4045 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046
4047 /* allocate enough for a simple 1:1 translation without
4048 replacements, if we need more, we'll resize */
4049 res = PyUnicode_FromUnicode(NULL, size);
4050 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004051 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 return res;
4054 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 while (p<endp) {
4057 /* try to encode it */
4058 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004059 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 goto onError;
4062 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004063 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 if (x!=Py_None) /* it worked => adjust input pointer */
4065 ++p;
4066 else { /* untranslatable character */
4067 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068 Py_ssize_t repsize;
4069 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 Py_UNICODE *uni2;
4071 /* startpos for collecting untranslatable chars */
4072 const Py_UNICODE *collstart = p;
4073 const Py_UNICODE *collend = p+1;
4074 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076 /* find all untranslatable characters */
4077 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004078 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 goto onError;
4080 Py_XDECREF(x);
4081 if (x!=Py_None)
4082 break;
4083 ++collend;
4084 }
4085 /* cache callback name lookup
4086 * (if not done yet, i.e. it's the first error) */
4087 if (known_errorHandler==-1) {
4088 if ((errors==NULL) || (!strcmp(errors, "strict")))
4089 known_errorHandler = 1;
4090 else if (!strcmp(errors, "replace"))
4091 known_errorHandler = 2;
4092 else if (!strcmp(errors, "ignore"))
4093 known_errorHandler = 3;
4094 else if (!strcmp(errors, "xmlcharrefreplace"))
4095 known_errorHandler = 4;
4096 else
4097 known_errorHandler = 0;
4098 }
4099 switch (known_errorHandler) {
4100 case 1: /* strict */
4101 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4102 goto onError;
4103 case 2: /* replace */
4104 /* No need to check for space, this is a 1:1 replacement */
4105 for (coll = collstart; coll<collend; ++coll)
4106 *str++ = '?';
4107 /* fall through */
4108 case 3: /* ignore */
4109 p = collend;
4110 break;
4111 case 4: /* xmlcharrefreplace */
4112 /* generate replacement (temporarily (mis)uses p) */
4113 for (p = collstart; p < collend; ++p) {
4114 char buffer[2+29+1+1];
4115 char *cp;
4116 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004117 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004118 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4119 goto onError;
4120 for (cp = buffer; *cp; ++cp)
4121 *str++ = *cp;
4122 }
4123 p = collend;
4124 break;
4125 default:
4126 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4127 reason, startp, size, &exc,
4128 collstart-startp, collend-startp, &newpos);
4129 if (repunicode == NULL)
4130 goto onError;
4131 /* generate replacement */
4132 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004133 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4135 Py_DECREF(repunicode);
4136 goto onError;
4137 }
4138 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4139 *str++ = *uni2;
4140 p = startp + newpos;
4141 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 }
4143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 /* Resize if we allocated to much */
4146 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004147 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004148 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004149 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 }
4151 Py_XDECREF(exc);
4152 Py_XDECREF(errorHandler);
4153 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 onError:
4156 Py_XDECREF(res);
4157 Py_XDECREF(exc);
4158 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 return NULL;
4160}
4161
4162PyObject *PyUnicode_Translate(PyObject *str,
4163 PyObject *mapping,
4164 const char *errors)
4165{
4166 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004167
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 str = PyUnicode_FromObject(str);
4169 if (str == NULL)
4170 goto onError;
4171 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4172 PyUnicode_GET_SIZE(str),
4173 mapping,
4174 errors);
4175 Py_DECREF(str);
4176 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004177
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 onError:
4179 Py_XDECREF(str);
4180 return NULL;
4181}
Tim Petersced69f82003-09-16 20:30:58 +00004182
Guido van Rossum9e896b32000-04-05 20:11:21 +00004183/* --- Decimal Encoder ---------------------------------------------------- */
4184
4185int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004186 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004187 char *output,
4188 const char *errors)
4189{
4190 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 PyObject *errorHandler = NULL;
4192 PyObject *exc = NULL;
4193 const char *encoding = "decimal";
4194 const char *reason = "invalid decimal Unicode string";
4195 /* the following variable is used for caching string comparisons
4196 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4197 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004198
4199 if (output == NULL) {
4200 PyErr_BadArgument();
4201 return -1;
4202 }
4203
4204 p = s;
4205 end = s + length;
4206 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004208 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004210 Py_ssize_t repsize;
4211 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 Py_UNICODE *uni2;
4213 Py_UNICODE *collstart;
4214 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004215
Guido van Rossum9e896b32000-04-05 20:11:21 +00004216 if (Py_UNICODE_ISSPACE(ch)) {
4217 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004219 continue;
4220 }
4221 decimal = Py_UNICODE_TODECIMAL(ch);
4222 if (decimal >= 0) {
4223 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004225 continue;
4226 }
Guido van Rossumba477042000-04-06 18:18:10 +00004227 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004228 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004230 continue;
4231 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 /* All other characters are considered unencodable */
4233 collstart = p;
4234 collend = p+1;
4235 while (collend < end) {
4236 if ((0 < *collend && *collend < 256) ||
4237 !Py_UNICODE_ISSPACE(*collend) ||
4238 Py_UNICODE_TODECIMAL(*collend))
4239 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004240 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 /* cache callback name lookup
4242 * (if not done yet, i.e. it's the first error) */
4243 if (known_errorHandler==-1) {
4244 if ((errors==NULL) || (!strcmp(errors, "strict")))
4245 known_errorHandler = 1;
4246 else if (!strcmp(errors, "replace"))
4247 known_errorHandler = 2;
4248 else if (!strcmp(errors, "ignore"))
4249 known_errorHandler = 3;
4250 else if (!strcmp(errors, "xmlcharrefreplace"))
4251 known_errorHandler = 4;
4252 else
4253 known_errorHandler = 0;
4254 }
4255 switch (known_errorHandler) {
4256 case 1: /* strict */
4257 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4258 goto onError;
4259 case 2: /* replace */
4260 for (p = collstart; p < collend; ++p)
4261 *output++ = '?';
4262 /* fall through */
4263 case 3: /* ignore */
4264 p = collend;
4265 break;
4266 case 4: /* xmlcharrefreplace */
4267 /* generate replacement (temporarily (mis)uses p) */
4268 for (p = collstart; p < collend; ++p)
4269 output += sprintf(output, "&#%d;", (int)*p);
4270 p = collend;
4271 break;
4272 default:
4273 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4274 encoding, reason, s, length, &exc,
4275 collstart-s, collend-s, &newpos);
4276 if (repunicode == NULL)
4277 goto onError;
4278 /* generate replacement */
4279 repsize = PyUnicode_GET_SIZE(repunicode);
4280 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4281 Py_UNICODE ch = *uni2;
4282 if (Py_UNICODE_ISSPACE(ch))
4283 *output++ = ' ';
4284 else {
4285 decimal = Py_UNICODE_TODECIMAL(ch);
4286 if (decimal >= 0)
4287 *output++ = '0' + decimal;
4288 else if (0 < ch && ch < 256)
4289 *output++ = (char)ch;
4290 else {
4291 Py_DECREF(repunicode);
4292 raise_encode_exception(&exc, encoding,
4293 s, length, collstart-s, collend-s, reason);
4294 goto onError;
4295 }
4296 }
4297 }
4298 p = s + newpos;
4299 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004300 }
4301 }
4302 /* 0-terminate the output string */
4303 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 Py_XDECREF(exc);
4305 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004306 return 0;
4307
4308 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309 Py_XDECREF(exc);
4310 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004311 return -1;
4312}
4313
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314/* --- Helpers ------------------------------------------------------------ */
4315
Fredrik Lundha50d2012006-05-26 17:04:58 +00004316#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004317
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004318#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004319#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004320#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004321
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004322Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004323STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4324{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004325 if (str[0] != other[0])
4326 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004327 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4328}
4329
Fredrik Lundhb9479482006-05-26 17:22:38 +00004330#define STRINGLIB_EMPTY unicode_empty
4331
Fredrik Lundha50d2012006-05-26 17:04:58 +00004332#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004333
4334#include "stringlib/count.h"
4335#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004336#include "stringlib/partition.h"
4337
/* Helper macro that normalizes the slice bounds held in the local
   variables `start` and `end` against (obj)->length: negative values are
   taken relative to the length and then clamped to 0, and `end` is
   capped at the length.  `start` is not capped at the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4350
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004352 PyObject *substr,
4353 Py_ssize_t start,
4354 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004356 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004357 PyUnicodeObject* str_obj;
4358 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004359
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004360 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4361 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004363 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4364 if (!sub_obj) {
4365 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366 return -1;
4367 }
Tim Petersced69f82003-09-16 20:30:58 +00004368
Fredrik Lundhc8162812006-05-26 19:33:03 +00004369 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004370
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004371 result = stringlib_count(
4372 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4373 );
4374
4375 Py_DECREF(sub_obj);
4376 Py_DECREF(str_obj);
4377
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 return result;
4379}
4380
Martin v. Löwis18e16552006-02-15 17:27:45 +00004381Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004382 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004383 Py_ssize_t start,
4384 Py_ssize_t end,
4385 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004387 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004388
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004389 str = PyUnicode_FromObject(str);
4390 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004391 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004392 sub = PyUnicode_FromObject(sub);
4393 if (!sub) {
4394 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004395 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 }
Tim Petersced69f82003-09-16 20:30:58 +00004397
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004398 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004399 result = stringlib_find_slice(
4400 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4401 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4402 start, end
4403 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004404 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004405 result = stringlib_rfind_slice(
4406 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4407 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4408 start, end
4409 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004410
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004411 Py_DECREF(str);
4412 Py_DECREF(sub);
4413
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 return result;
4415}
4416
Tim Petersced69f82003-09-16 20:30:58 +00004417static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418int tailmatch(PyUnicodeObject *self,
4419 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004420 Py_ssize_t start,
4421 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 int direction)
4423{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 if (substring->length == 0)
4425 return 1;
4426
Fredrik Lundhc8162812006-05-26 19:33:03 +00004427 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428
4429 end -= substring->length;
4430 if (end < start)
4431 return 0;
4432
4433 if (direction > 0) {
4434 if (Py_UNICODE_MATCH(self, end, substring))
4435 return 1;
4436 } else {
4437 if (Py_UNICODE_MATCH(self, start, substring))
4438 return 1;
4439 }
4440
4441 return 0;
4442}
4443
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004446 Py_ssize_t start,
4447 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 int direction)
4449{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004450 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004451
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 str = PyUnicode_FromObject(str);
4453 if (str == NULL)
4454 return -1;
4455 substr = PyUnicode_FromObject(substr);
4456 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004457 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 return -1;
4459 }
Tim Petersced69f82003-09-16 20:30:58 +00004460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 result = tailmatch((PyUnicodeObject *)str,
4462 (PyUnicodeObject *)substr,
4463 start, end, direction);
4464 Py_DECREF(str);
4465 Py_DECREF(substr);
4466 return result;
4467}
4468
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469/* Apply fixfct filter to the Unicode object self and return a
4470 reference to the modified object */
4471
Tim Petersced69f82003-09-16 20:30:58 +00004472static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473PyObject *fixup(PyUnicodeObject *self,
4474 int (*fixfct)(PyUnicodeObject *s))
4475{
4476
4477 PyUnicodeObject *u;
4478
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004479 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 if (u == NULL)
4481 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004482
4483 Py_UNICODE_COPY(u->str, self->str, self->length);
4484
Tim Peters7a29bd52001-09-12 03:03:31 +00004485 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 /* fixfct should return TRUE if it modified the buffer. If
4487 FALSE, return a reference to the original buffer instead
4488 (to save space, not time) */
4489 Py_INCREF(self);
4490 Py_DECREF(u);
4491 return (PyObject*) self;
4492 }
4493 return (PyObject*) u;
4494}
4495
Tim Petersced69f82003-09-16 20:30:58 +00004496static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497int fixupper(PyUnicodeObject *self)
4498{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004499 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 Py_UNICODE *s = self->str;
4501 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004502
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 while (len-- > 0) {
4504 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004505
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 ch = Py_UNICODE_TOUPPER(*s);
4507 if (ch != *s) {
4508 status = 1;
4509 *s = ch;
4510 }
4511 s++;
4512 }
4513
4514 return status;
4515}
4516
Tim Petersced69f82003-09-16 20:30:58 +00004517static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518int fixlower(PyUnicodeObject *self)
4519{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004520 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 Py_UNICODE *s = self->str;
4522 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004523
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 while (len-- > 0) {
4525 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004526
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 ch = Py_UNICODE_TOLOWER(*s);
4528 if (ch != *s) {
4529 status = 1;
4530 *s = ch;
4531 }
4532 s++;
4533 }
4534
4535 return status;
4536}
4537
Tim Petersced69f82003-09-16 20:30:58 +00004538static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539int fixswapcase(PyUnicodeObject *self)
4540{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004541 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 Py_UNICODE *s = self->str;
4543 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004544
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 while (len-- > 0) {
4546 if (Py_UNICODE_ISUPPER(*s)) {
4547 *s = Py_UNICODE_TOLOWER(*s);
4548 status = 1;
4549 } else if (Py_UNICODE_ISLOWER(*s)) {
4550 *s = Py_UNICODE_TOUPPER(*s);
4551 status = 1;
4552 }
4553 s++;
4554 }
4555
4556 return status;
4557}
4558
Tim Petersced69f82003-09-16 20:30:58 +00004559static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560int fixcapitalize(PyUnicodeObject *self)
4561{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004562 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004563 Py_UNICODE *s = self->str;
4564 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004565
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004566 if (len == 0)
4567 return 0;
4568 if (Py_UNICODE_ISLOWER(*s)) {
4569 *s = Py_UNICODE_TOUPPER(*s);
4570 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004572 s++;
4573 while (--len > 0) {
4574 if (Py_UNICODE_ISUPPER(*s)) {
4575 *s = Py_UNICODE_TOLOWER(*s);
4576 status = 1;
4577 }
4578 s++;
4579 }
4580 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581}
4582
4583static
4584int fixtitle(PyUnicodeObject *self)
4585{
4586 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4587 register Py_UNICODE *e;
4588 int previous_is_cased;
4589
4590 /* Shortcut for single character strings */
4591 if (PyUnicode_GET_SIZE(self) == 1) {
4592 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4593 if (*p != ch) {
4594 *p = ch;
4595 return 1;
4596 }
4597 else
4598 return 0;
4599 }
Tim Petersced69f82003-09-16 20:30:58 +00004600
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601 e = p + PyUnicode_GET_SIZE(self);
4602 previous_is_cased = 0;
4603 for (; p < e; p++) {
4604 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004605
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 if (previous_is_cased)
4607 *p = Py_UNICODE_TOLOWER(ch);
4608 else
4609 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004610
4611 if (Py_UNICODE_ISLOWER(ch) ||
4612 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613 Py_UNICODE_ISTITLE(ch))
4614 previous_is_cased = 1;
4615 else
4616 previous_is_cased = 0;
4617 }
4618 return 1;
4619}
4620
Tim Peters8ce9f162004-08-27 01:49:32 +00004621PyObject *
4622PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623{
Tim Peters8ce9f162004-08-27 01:49:32 +00004624 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004625 const Py_UNICODE blank = ' ';
4626 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004627 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004628 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004629 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4630 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004631 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4632 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004633 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004634 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004635 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636
Tim Peters05eba1f2004-08-27 21:32:02 +00004637 fseq = PySequence_Fast(seq, "");
4638 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004639 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004640 }
4641
Tim Peters91879ab2004-08-27 22:35:44 +00004642 /* Grrrr. A codec may be invoked to convert str objects to
4643 * Unicode, and so it's possible to call back into Python code
4644 * during PyUnicode_FromObject(), and so it's possible for a sick
4645 * codec to change the size of fseq (if seq is a list). Therefore
4646 * we have to keep refetching the size -- can't assume seqlen
4647 * is invariant.
4648 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004649 seqlen = PySequence_Fast_GET_SIZE(fseq);
4650 /* If empty sequence, return u"". */
4651 if (seqlen == 0) {
4652 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4653 goto Done;
4654 }
4655 /* If singleton sequence with an exact Unicode, return that. */
4656 if (seqlen == 1) {
4657 item = PySequence_Fast_GET_ITEM(fseq, 0);
4658 if (PyUnicode_CheckExact(item)) {
4659 Py_INCREF(item);
4660 res = (PyUnicodeObject *)item;
4661 goto Done;
4662 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004663 }
4664
Tim Peters05eba1f2004-08-27 21:32:02 +00004665 /* At least two items to join, or one that isn't exact Unicode. */
4666 if (seqlen > 1) {
4667 /* Set up sep and seplen -- they're needed. */
4668 if (separator == NULL) {
4669 sep = &blank;
4670 seplen = 1;
4671 }
4672 else {
4673 internal_separator = PyUnicode_FromObject(separator);
4674 if (internal_separator == NULL)
4675 goto onError;
4676 sep = PyUnicode_AS_UNICODE(internal_separator);
4677 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004678 /* In case PyUnicode_FromObject() mutated seq. */
4679 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004680 }
4681 }
4682
4683 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004684 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004685 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004686 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004687 res_p = PyUnicode_AS_UNICODE(res);
4688 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004689
Tim Peters05eba1f2004-08-27 21:32:02 +00004690 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004691 Py_ssize_t itemlen;
4692 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004693
4694 item = PySequence_Fast_GET_ITEM(fseq, i);
4695 /* Convert item to Unicode. */
4696 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4697 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004698 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004699 " %.80s found",
4700 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004701 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004702 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004703 item = PyUnicode_FromObject(item);
4704 if (item == NULL)
4705 goto onError;
4706 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004707
Tim Peters91879ab2004-08-27 22:35:44 +00004708 /* In case PyUnicode_FromObject() mutated seq. */
4709 seqlen = PySequence_Fast_GET_SIZE(fseq);
4710
Tim Peters8ce9f162004-08-27 01:49:32 +00004711 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004713 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004714 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004715 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004716 if (i < seqlen - 1) {
4717 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004718 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004719 goto Overflow;
4720 }
4721 if (new_res_used > res_alloc) {
4722 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004723 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004724 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004725 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004726 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004727 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004728 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004729 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004731 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004732 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004734
4735 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004736 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004737 res_p += itemlen;
4738 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004739 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004740 res_p += seplen;
4741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004743 res_used = new_res_used;
4744 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004745
Tim Peters05eba1f2004-08-27 21:32:02 +00004746 /* Shrink res to match the used area; this probably can't fail,
4747 * but it's cheap to check.
4748 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004749 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004750 goto onError;
4751
4752 Done:
4753 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004754 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 return (PyObject *)res;
4756
Tim Peters8ce9f162004-08-27 01:49:32 +00004757 Overflow:
4758 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004759 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004760 Py_DECREF(item);
4761 /* fall through */
4762
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004764 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004765 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004766 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 return NULL;
4768}
4769
Tim Petersced69f82003-09-16 20:30:58 +00004770static
4771PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004772 Py_ssize_t left,
4773 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 Py_UNICODE fill)
4775{
4776 PyUnicodeObject *u;
4777
4778 if (left < 0)
4779 left = 0;
4780 if (right < 0)
4781 right = 0;
4782
Tim Peters7a29bd52001-09-12 03:03:31 +00004783 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 Py_INCREF(self);
4785 return self;
4786 }
4787
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00004788 if (left > PY_SSIZE_T_MAX - self->length ||
4789 right > PY_SSIZE_T_MAX - (left + self->length)) {
4790 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
4791 return NULL;
4792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 u = _PyUnicode_New(left + self->length + right);
4794 if (u) {
4795 if (left)
4796 Py_UNICODE_FILL(u->str, fill, left);
4797 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4798 if (right)
4799 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4800 }
4801
4802 return u;
4803}
4804
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The enclosing function must declare `PyObject *str` and `list`, and
   must provide an `onError` label, which is jumped to when either the
   slice creation or the list append fails.
   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and is safe to use as `SPLIT_APPEND(...);` in any
   statement position, including an unbraced if/else arm. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4815
4816static
4817PyObject *split_whitespace(PyUnicodeObject *self,
4818 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004819 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004821 register Py_ssize_t i;
4822 register Py_ssize_t j;
4823 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 PyObject *str;
4825
4826 for (i = j = 0; i < len; ) {
4827 /* find a token */
4828 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4829 i++;
4830 j = i;
4831 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4832 i++;
4833 if (j < i) {
4834 if (maxcount-- <= 0)
4835 break;
4836 SPLIT_APPEND(self->str, j, i);
4837 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4838 i++;
4839 j = i;
4840 }
4841 }
4842 if (j < len) {
4843 SPLIT_APPEND(self->str, j, len);
4844 }
4845 return list;
4846
4847 onError:
4848 Py_DECREF(list);
4849 return NULL;
4850}
4851
4852PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004853 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004855 register Py_ssize_t i;
4856 register Py_ssize_t j;
4857 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 PyObject *list;
4859 PyObject *str;
4860 Py_UNICODE *data;
4861
4862 string = PyUnicode_FromObject(string);
4863 if (string == NULL)
4864 return NULL;
4865 data = PyUnicode_AS_UNICODE(string);
4866 len = PyUnicode_GET_SIZE(string);
4867
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 list = PyList_New(0);
4869 if (!list)
4870 goto onError;
4871
4872 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004874
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004876 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878
4879 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004880 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 if (i < len) {
4882 if (data[i] == '\r' && i + 1 < len &&
4883 data[i+1] == '\n')
4884 i += 2;
4885 else
4886 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004887 if (keepends)
4888 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 }
Guido van Rossum86662912000-04-11 15:38:46 +00004890 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 j = i;
4892 }
4893 if (j < len) {
4894 SPLIT_APPEND(data, j, len);
4895 }
4896
4897 Py_DECREF(string);
4898 return list;
4899
4900 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004901 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 Py_DECREF(string);
4903 return NULL;
4904}
4905
Tim Petersced69f82003-09-16 20:30:58 +00004906static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907PyObject *split_char(PyUnicodeObject *self,
4908 PyObject *list,
4909 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004910 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004912 register Py_ssize_t i;
4913 register Py_ssize_t j;
4914 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 PyObject *str;
4916
4917 for (i = j = 0; i < len; ) {
4918 if (self->str[i] == ch) {
4919 if (maxcount-- <= 0)
4920 break;
4921 SPLIT_APPEND(self->str, j, i);
4922 i = j = i + 1;
4923 } else
4924 i++;
4925 }
4926 if (j <= len) {
4927 SPLIT_APPEND(self->str, j, len);
4928 }
4929 return list;
4930
4931 onError:
4932 Py_DECREF(list);
4933 return NULL;
4934}
4935
Tim Petersced69f82003-09-16 20:30:58 +00004936static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937PyObject *split_substring(PyUnicodeObject *self,
4938 PyObject *list,
4939 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004940 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004942 register Py_ssize_t i;
4943 register Py_ssize_t j;
4944 Py_ssize_t len = self->length;
4945 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 PyObject *str;
4947
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004948 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949 if (Py_UNICODE_MATCH(self, i, substring)) {
4950 if (maxcount-- <= 0)
4951 break;
4952 SPLIT_APPEND(self->str, j, i);
4953 i = j = i + sublen;
4954 } else
4955 i++;
4956 }
4957 if (j <= len) {
4958 SPLIT_APPEND(self->str, j, len);
4959 }
4960 return list;
4961
4962 onError:
4963 Py_DECREF(list);
4964 return NULL;
4965}
4966
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004967static
4968PyObject *rsplit_whitespace(PyUnicodeObject *self,
4969 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004970 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004971{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 register Py_ssize_t i;
4973 register Py_ssize_t j;
4974 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004975 PyObject *str;
4976
4977 for (i = j = len - 1; i >= 0; ) {
4978 /* find a token */
4979 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4980 i--;
4981 j = i;
4982 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4983 i--;
4984 if (j > i) {
4985 if (maxcount-- <= 0)
4986 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004987 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004988 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4989 i--;
4990 j = i;
4991 }
4992 }
4993 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004994 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004995 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004996 if (PyList_Reverse(list) < 0)
4997 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004998 return list;
4999
5000 onError:
5001 Py_DECREF(list);
5002 return NULL;
5003}
5004
5005static
5006PyObject *rsplit_char(PyUnicodeObject *self,
5007 PyObject *list,
5008 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005009 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005010{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005011 register Py_ssize_t i;
5012 register Py_ssize_t j;
5013 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005014 PyObject *str;
5015
5016 for (i = j = len - 1; i >= 0; ) {
5017 if (self->str[i] == ch) {
5018 if (maxcount-- <= 0)
5019 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005020 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005021 j = i = i - 1;
5022 } else
5023 i--;
5024 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005025 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005026 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005027 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005028 if (PyList_Reverse(list) < 0)
5029 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005030 return list;
5031
5032 onError:
5033 Py_DECREF(list);
5034 return NULL;
5035}
5036
5037static
5038PyObject *rsplit_substring(PyUnicodeObject *self,
5039 PyObject *list,
5040 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005041 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005042{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005043 register Py_ssize_t i;
5044 register Py_ssize_t j;
5045 Py_ssize_t len = self->length;
5046 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005047 PyObject *str;
5048
5049 for (i = len - sublen, j = len; i >= 0; ) {
5050 if (Py_UNICODE_MATCH(self, i, substring)) {
5051 if (maxcount-- <= 0)
5052 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005053 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005054 j = i;
5055 i -= sublen;
5056 } else
5057 i--;
5058 }
5059 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005060 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005061 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005062 if (PyList_Reverse(list) < 0)
5063 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005064 return list;
5065
5066 onError:
5067 Py_DECREF(list);
5068 return NULL;
5069}
5070
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071#undef SPLIT_APPEND
5072
5073static
5074PyObject *split(PyUnicodeObject *self,
5075 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005076 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077{
5078 PyObject *list;
5079
5080 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005081 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082
5083 list = PyList_New(0);
5084 if (!list)
5085 return NULL;
5086
5087 if (substring == NULL)
5088 return split_whitespace(self,list,maxcount);
5089
5090 else if (substring->length == 1)
5091 return split_char(self,list,substring->str[0],maxcount);
5092
5093 else if (substring->length == 0) {
5094 Py_DECREF(list);
5095 PyErr_SetString(PyExc_ValueError, "empty separator");
5096 return NULL;
5097 }
5098 else
5099 return split_substring(self,list,substring,maxcount);
5100}
5101
Tim Petersced69f82003-09-16 20:30:58 +00005102static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005103PyObject *rsplit(PyUnicodeObject *self,
5104 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005105 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005106{
5107 PyObject *list;
5108
5109 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005110 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005111
5112 list = PyList_New(0);
5113 if (!list)
5114 return NULL;
5115
5116 if (substring == NULL)
5117 return rsplit_whitespace(self,list,maxcount);
5118
5119 else if (substring->length == 1)
5120 return rsplit_char(self,list,substring->str[0],maxcount);
5121
5122 else if (substring->length == 0) {
5123 Py_DECREF(list);
5124 PyErr_SetString(PyExc_ValueError, "empty separator");
5125 return NULL;
5126 }
5127 else
5128 return rsplit_substring(self,list,substring,maxcount);
5129}
5130
5131static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132PyObject *replace(PyUnicodeObject *self,
5133 PyUnicodeObject *str1,
5134 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005135 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136{
5137 PyUnicodeObject *u;
5138
5139 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005140 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141
Fredrik Lundh347ee272006-05-24 16:35:18 +00005142 if (str1->length == str2->length) {
5143 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005144 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005145 if (str1->length == 1) {
5146 /* replace characters */
5147 Py_UNICODE u1, u2;
5148 if (!findchar(self->str, self->length, str1->str[0]))
5149 goto nothing;
5150 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5151 if (!u)
5152 return NULL;
5153 Py_UNICODE_COPY(u->str, self->str, self->length);
5154 u1 = str1->str[0];
5155 u2 = str2->str[0];
5156 for (i = 0; i < u->length; i++)
5157 if (u->str[i] == u1) {
5158 if (--maxcount < 0)
5159 break;
5160 u->str[i] = u2;
5161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005163 i = fastsearch(
5164 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005166 if (i < 0)
5167 goto nothing;
5168 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5169 if (!u)
5170 return NULL;
5171 Py_UNICODE_COPY(u->str, self->str, self->length);
5172 while (i <= self->length - str1->length)
5173 if (Py_UNICODE_MATCH(self, i, str1)) {
5174 if (--maxcount < 0)
5175 break;
5176 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5177 i += str1->length;
5178 } else
5179 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005182
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005183 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005184 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 Py_UNICODE *p;
5186
5187 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005188 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189 if (n > maxcount)
5190 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005191 if (n == 0)
5192 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005193 /* new_size = self->length + n * (str2->length - str1->length)); */
5194 delta = (str2->length - str1->length);
5195 if (delta == 0) {
5196 new_size = self->length;
5197 } else {
5198 product = n * (str2->length - str1->length);
5199 if ((product / (str2->length - str1->length)) != n) {
5200 PyErr_SetString(PyExc_OverflowError,
5201 "replace string is too long");
5202 return NULL;
5203 }
5204 new_size = self->length + product;
5205 if (new_size < 0) {
5206 PyErr_SetString(PyExc_OverflowError,
5207 "replace string is too long");
5208 return NULL;
5209 }
5210 }
5211 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005212 if (!u)
5213 return NULL;
5214 i = 0;
5215 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005216 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005217 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005218 while (n-- > 0) {
5219 /* look for next match */
5220 j = i;
5221 while (j <= e) {
5222 if (Py_UNICODE_MATCH(self, j, str1))
5223 break;
5224 j++;
5225 }
5226 if (j > i) {
5227 if (j > e)
5228 break;
5229 /* copy unchanged part [i:j] */
5230 Py_UNICODE_COPY(p, self->str+i, j-i);
5231 p += j - i;
5232 }
5233 /* copy substitution string */
5234 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005235 Py_UNICODE_COPY(p, str2->str, str2->length);
5236 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005237 }
5238 i = j + str1->length;
5239 }
5240 if (i < self->length)
5241 /* copy tail [i:] */
5242 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005243 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005244 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005245 while (n > 0) {
5246 Py_UNICODE_COPY(p, str2->str, str2->length);
5247 p += str2->length;
5248 if (--n <= 0)
5249 break;
5250 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005252 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 }
5254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005256
5257nothing:
5258 /* nothing to replace; return original string (when possible) */
5259 if (PyUnicode_CheckExact(self)) {
5260 Py_INCREF(self);
5261 return (PyObject *) self;
5262 }
5263 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264}
5265
5266/* --- Unicode Object Methods --------------------------------------------- */
5267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005268PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269"S.title() -> unicode\n\
5270\n\
5271Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005272characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273
5274static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005275unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 return fixup(self, fixtitle);
5278}
5279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005280PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281"S.capitalize() -> unicode\n\
5282\n\
5283Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005284have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285
5286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005287unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 return fixup(self, fixcapitalize);
5290}
5291
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, replaces each
   word in the list with its fixup(..., fixcapitalize) result, and joins
   the words again with PyUnicode_Join(NULL, ...).  Returns NULL if any
   step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5329
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005330/* Argument converter. Coerces to a single unicode character */
5331
5332static int
5333convert_uc(PyObject *obj, void *addr)
5334{
5335 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5336 PyObject *uniobj;
5337 Py_UNICODE *unistr;
5338
5339 uniobj = PyUnicode_FromObject(obj);
5340 if (uniobj == NULL) {
5341 PyErr_SetString(PyExc_TypeError,
5342 "The fill character cannot be converted to Unicode");
5343 return 0;
5344 }
5345 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5346 PyErr_SetString(PyExc_TypeError,
5347 "The fill character must be exactly one character long");
5348 Py_DECREF(uniobj);
5349 return 0;
5350 }
5351 unistr = PyUnicode_AS_UNICODE(uniobj);
5352 *fillcharloc = unistr[0];
5353 Py_DECREF(uniobj);
5354 return 1;
5355}
5356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005357PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005358"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005360Return S centered in a Unicode string of length width. Padding is\n\
5361done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362
5363static PyObject *
5364unicode_center(PyUnicodeObject *self, PyObject *args)
5365{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005366 Py_ssize_t marg, left;
5367 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005368 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369
Thomas Woutersde017742006-02-16 19:34:37 +00005370 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 return NULL;
5372
Tim Peters7a29bd52001-09-12 03:03:31 +00005373 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 Py_INCREF(self);
5375 return (PyObject*) self;
5376 }
5377
5378 marg = width - self->length;
5379 left = marg / 2 + (marg & width & 1);
5380
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005381 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382}
5383
Marc-André Lemburge5034372000-08-08 08:04:29 +00005384#if 0
5385
5386/* This code should go into some future Unicode collation support
5387 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005388 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005389
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005390/* speedy UTF-16 code point order comparison */
5391/* gleaned from: */
5392/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5393
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005394static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005395{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005396 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005397 0, 0, 0, 0, 0, 0, 0, 0,
5398 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005399 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005400};
5401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402static int
5403unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005405 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005406
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 Py_UNICODE *s1 = str1->str;
5408 Py_UNICODE *s2 = str2->str;
5409
5410 len1 = str1->length;
5411 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005414 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005415
5416 c1 = *s1++;
5417 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005418
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005419 if (c1 > (1<<11) * 26)
5420 c1 += utf16Fixup[c1>>11];
5421 if (c2 > (1<<11) * 26)
5422 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005423 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005424
5425 if (c1 != c2)
5426 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005427
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005428 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 }
5430
5431 return (len1 < len2) ? -1 : (len1 != len2);
5432}
5433
Marc-André Lemburge5034372000-08-08 08:04:29 +00005434#else
5435
5436static int
5437unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5438{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005439 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005440
5441 Py_UNICODE *s1 = str1->str;
5442 Py_UNICODE *s2 = str2->str;
5443
5444 len1 = str1->length;
5445 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005446
Marc-André Lemburge5034372000-08-08 08:04:29 +00005447 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005448 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005449
Fredrik Lundh45714e92001-06-26 16:39:36 +00005450 c1 = *s1++;
5451 c2 = *s2++;
5452
5453 if (c1 != c2)
5454 return (c1 < c2) ? -1 : 1;
5455
Marc-André Lemburge5034372000-08-08 08:04:29 +00005456 len1--; len2--;
5457 }
5458
5459 return (len1 < len2) ? -1 : (len1 != len2);
5460}
5461
5462#endif
5463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464int PyUnicode_Compare(PyObject *left,
5465 PyObject *right)
5466{
5467 PyUnicodeObject *u = NULL, *v = NULL;
5468 int result;
5469
5470 /* Coerce the two arguments */
5471 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5472 if (u == NULL)
5473 goto onError;
5474 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5475 if (v == NULL)
5476 goto onError;
5477
Thomas Wouters7e474022000-07-16 12:04:32 +00005478 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 if (v == u) {
5480 Py_DECREF(u);
5481 Py_DECREF(v);
5482 return 0;
5483 }
5484
5485 result = unicode_compare(u, v);
5486
5487 Py_DECREF(u);
5488 Py_DECREF(v);
5489 return result;
5490
5491onError:
5492 Py_XDECREF(u);
5493 Py_XDECREF(v);
5494 return -1;
5495}
5496
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005497PyObject *PyUnicode_RichCompare(PyObject *left,
5498 PyObject *right,
5499 int op)
5500{
5501 int result;
5502
5503 result = PyUnicode_Compare(left, right);
5504 if (result == -1 && PyErr_Occurred())
5505 goto onError;
5506
5507 /* Convert the return value to a Boolean */
5508 switch (op) {
5509 case Py_EQ:
5510 result = (result == 0);
5511 break;
5512 case Py_NE:
5513 result = (result != 0);
5514 break;
5515 case Py_LE:
5516 result = (result <= 0);
5517 break;
5518 case Py_GE:
5519 result = (result >= 0);
5520 break;
5521 case Py_LT:
5522 result = (result == -1);
5523 break;
5524 case Py_GT:
5525 result = (result == 1);
5526 break;
5527 }
5528 return PyBool_FromLong(result);
5529
5530 onError:
5531
5532 /* Standard case
5533
5534 Type errors mean that PyUnicode_FromObject() could not convert
5535 one of the arguments (usually the right hand side) to Unicode,
5536 ie. we can't handle the comparison request. However, it is
5537 possible that the other object knows a comparison method, which
5538 is why we return Py_NotImplemented to give the other object a
5539 chance.
5540
5541 */
5542 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5543 PyErr_Clear();
5544 Py_INCREF(Py_NotImplemented);
5545 return Py_NotImplemented;
5546 }
5547 if (op != Py_EQ && op != Py_NE)
5548 return NULL;
5549
5550 /* Equality comparison.
5551
5552 This is a special case: we silence any PyExc_UnicodeDecodeError
5553 and instead turn it into a PyErr_UnicodeWarning.
5554
5555 */
5556 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5557 return NULL;
5558 PyErr_Clear();
5559 if (PyErr_Warn(PyExc_UnicodeWarning,
5560 (op == Py_EQ) ?
5561 "Unicode equal comparison "
5562 "failed to convert both arguments to Unicode - "
5563 "interpreting them as being unequal" :
5564 "Unicode unequal comparison "
5565 "failed to convert both arguments to Unicode - "
5566 "interpreting them as being unequal"
5567 ) < 0)
5568 return NULL;
5569 result = (op == Py_NE);
5570 return PyBool_FromLong(result);
5571}
5572
Guido van Rossum403d68b2000-03-13 15:55:09 +00005573int PyUnicode_Contains(PyObject *container,
5574 PyObject *element)
5575{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005576 PyObject *str, *sub;
5577 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005578
5579 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005580 sub = PyUnicode_FromObject(element);
5581 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005582 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005583 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005584 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005585 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005586
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005587 str = PyUnicode_FromObject(container);
5588 if (!str) {
5589 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005590 return -1;
5591 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005592
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005593 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005594
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005595 Py_DECREF(str);
5596 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005597
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005598 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005599}
5600
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601/* Concat to string or Unicode object giving a new Unicode object. */
5602
5603PyObject *PyUnicode_Concat(PyObject *left,
5604 PyObject *right)
5605{
5606 PyUnicodeObject *u = NULL, *v = NULL, *w;
5607
5608 /* Coerce the two arguments */
5609 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5610 if (u == NULL)
5611 goto onError;
5612 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5613 if (v == NULL)
5614 goto onError;
5615
5616 /* Shortcuts */
5617 if (v == unicode_empty) {
5618 Py_DECREF(v);
5619 return (PyObject *)u;
5620 }
5621 if (u == unicode_empty) {
5622 Py_DECREF(u);
5623 return (PyObject *)v;
5624 }
5625
5626 /* Concat the two Unicode strings */
5627 w = _PyUnicode_New(u->length + v->length);
5628 if (w == NULL)
5629 goto onError;
5630 Py_UNICODE_COPY(w->str, u->str, u->length);
5631 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5632
5633 Py_DECREF(u);
5634 Py_DECREF(v);
5635 return (PyObject *)w;
5636
5637onError:
5638 Py_XDECREF(u);
5639 Py_XDECREF(v);
5640 return NULL;
5641}
5642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005643PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644"S.count(sub[, start[, end]]) -> int\n\
5645\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005646Return the number of non-overlapping occurrences of substring sub in\n\
5647Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005648interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
5650static PyObject *
5651unicode_count(PyUnicodeObject *self, PyObject *args)
5652{
5653 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005654 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005655 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 PyObject *result;
5657
Guido van Rossumb8872e62000-05-09 14:14:27 +00005658 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5659 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 return NULL;
5661
5662 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005663 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 if (substring == NULL)
5665 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005666
Fredrik Lundhc8162812006-05-26 19:33:03 +00005667 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005669 result = PyInt_FromSsize_t(
5670 stringlib_count(self->str + start, end - start,
5671 substring->str, substring->length)
5672 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
5674 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 return result;
5677}
5678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005679PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005680"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005682Encodes S using the codec registered for encoding. encoding defaults\n\
5683to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005684handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5686'xmlcharrefreplace' as well as any other name registered with\n\
5687codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
5689static PyObject *
5690unicode_encode(PyUnicodeObject *self, PyObject *args)
5691{
5692 char *encoding = NULL;
5693 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005694 PyObject *v;
5695
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5697 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005698 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005699 if (v == NULL)
5700 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005701 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5702 PyErr_Format(PyExc_TypeError,
5703 "encoder did not return a string/unicode object "
5704 "(type=%.400s)",
5705 v->ob_type->tp_name);
5706 Py_DECREF(v);
5707 return NULL;
5708 }
5709 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005710
5711 onError:
5712 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005713}
5714
5715PyDoc_STRVAR(decode__doc__,
5716"S.decode([encoding[,errors]]) -> string or unicode\n\
5717\n\
5718Decodes S using the codec registered for encoding. encoding defaults\n\
5719to the default encoding. errors may be given to set a different error\n\
5720handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5721a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5722as well as any other name registerd with codecs.register_error that is\n\
5723able to handle UnicodeDecodeErrors.");
5724
5725static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005726unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005727{
5728 char *encoding = NULL;
5729 char *errors = NULL;
5730 PyObject *v;
5731
5732 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5733 return NULL;
5734 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005735 if (v == NULL)
5736 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005737 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5738 PyErr_Format(PyExc_TypeError,
5739 "decoder did not return a string/unicode object "
5740 "(type=%.400s)",
5741 v->ob_type->tp_name);
5742 Py_DECREF(v);
5743 return NULL;
5744 }
5745 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005746
5747 onError:
5748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749}
5750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005751PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752"S.expandtabs([tabsize]) -> unicode\n\
5753\n\
5754Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005755If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756
5757static PyObject*
5758unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5759{
5760 Py_UNICODE *e;
5761 Py_UNICODE *p;
5762 Py_UNICODE *q;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005763 Py_UNICODE *qe;
5764 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 PyUnicodeObject *u;
5766 int tabsize = 8;
5767
5768 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5769 return NULL;
5770
Thomas Wouters7e474022000-07-16 12:04:32 +00005771 /* First pass: determine size of output string */
Guido van Rossum44a93e52008-03-11 21:14:54 +00005772 i = 0; /* chars up to and including most recent \n or \r */
5773 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
5774 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 for (p = self->str; p < e; p++)
5776 if (*p == '\t') {
Neal Norwitz66e64e22007-06-09 04:06:30 +00005777 if (tabsize > 0) {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005778 incr = tabsize - (j % tabsize); /* cannot overflow */
5779 if (j > PY_SSIZE_T_MAX - incr)
5780 goto overflow1;
5781 j += incr;
5782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 }
5784 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005785 if (j > PY_SSIZE_T_MAX - 1)
5786 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 j++;
5788 if (*p == '\n' || *p == '\r') {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005789 if (i > PY_SSIZE_T_MAX - j)
5790 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 i += j;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005792 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 }
5794 }
5795
Guido van Rossum44a93e52008-03-11 21:14:54 +00005796 if (i > PY_SSIZE_T_MAX - j)
5797 goto overflow1;
Neal Norwitz66e64e22007-06-09 04:06:30 +00005798
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 /* Second pass: create output string and fill it */
5800 u = _PyUnicode_New(i + j);
5801 if (!u)
5802 return NULL;
5803
Guido van Rossum44a93e52008-03-11 21:14:54 +00005804 j = 0; /* same as in first pass */
5805 q = u->str; /* next output char */
5806 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807
5808 for (p = self->str; p < e; p++)
5809 if (*p == '\t') {
5810 if (tabsize > 0) {
5811 i = tabsize - (j % tabsize);
5812 j += i;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005813 while (i--) {
5814 if (q >= qe)
5815 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 *q++ = ' ';
Guido van Rossum44a93e52008-03-11 21:14:54 +00005817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 }
5819 }
5820 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005821 if (q >= qe)
5822 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 *q++ = *p;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005824 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 if (*p == '\n' || *p == '\r')
5826 j = 0;
5827 }
5828
5829 return (PyObject*) u;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005830
5831 overflow2:
5832 Py_DECREF(u);
5833 overflow1:
5834 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836}
5837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005838PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839"S.find(sub [,start [,end]]) -> int\n\
5840\n\
5841Return the lowest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00005842such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843arguments start and end are interpreted as in slice notation.\n\
5844\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005845Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846
5847static PyObject *
5848unicode_find(PyUnicodeObject *self, PyObject *args)
5849{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005850 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005852 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005853 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854
Guido van Rossumb8872e62000-05-09 14:14:27 +00005855 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5856 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005858 substring = PyUnicode_FromObject(substring);
5859 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 return NULL;
5861
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005862 result = stringlib_find_slice(
5863 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5864 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5865 start, end
5866 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867
5868 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005869
5870 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871}
5872
5873static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005874unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
5876 if (index < 0 || index >= self->length) {
5877 PyErr_SetString(PyExc_IndexError, "string index out of range");
5878 return NULL;
5879 }
5880
5881 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5882}
5883
5884static long
5885unicode_hash(PyUnicodeObject *self)
5886{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005887 /* Since Unicode objects compare equal to their ASCII string
5888 counterparts, they should use the individual character values
5889 as basis for their hash value. This is needed to assure that
5890 strings and Unicode objects behave in the same way as
5891 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892
Martin v. Löwis18e16552006-02-15 17:27:45 +00005893 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005894 register Py_UNICODE *p;
5895 register long x;
5896
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 if (self->hash != -1)
5898 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005899 len = PyUnicode_GET_SIZE(self);
5900 p = PyUnicode_AS_UNICODE(self);
5901 x = *p << 7;
5902 while (--len >= 0)
5903 x = (1000003*x) ^ *p++;
5904 x ^= PyUnicode_GET_SIZE(self);
5905 if (x == -1)
5906 x = -2;
5907 self->hash = x;
5908 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909}
5910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005911PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912"S.index(sub [,start [,end]]) -> int\n\
5913\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005914Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915
5916static PyObject *
5917unicode_index(PyUnicodeObject *self, PyObject *args)
5918{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005919 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005920 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005921 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005922 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923
Guido van Rossumb8872e62000-05-09 14:14:27 +00005924 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5925 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005927 substring = PyUnicode_FromObject(substring);
5928 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 return NULL;
5930
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005931 result = stringlib_find_slice(
5932 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5933 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5934 start, end
5935 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
5937 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005938
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 if (result < 0) {
5940 PyErr_SetString(PyExc_ValueError, "substring not found");
5941 return NULL;
5942 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005943
Martin v. Löwis18e16552006-02-15 17:27:45 +00005944 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945}
5946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005947PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005948"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005950Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
5953static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005954unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
5956 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5957 register const Py_UNICODE *e;
5958 int cased;
5959
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 /* Shortcut for single character strings */
5961 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005962 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005964 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005965 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005966 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005967
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 e = p + PyUnicode_GET_SIZE(self);
5969 cased = 0;
5970 for (; p < e; p++) {
5971 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005974 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 else if (!cased && Py_UNICODE_ISLOWER(ch))
5976 cased = 1;
5977 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005978 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979}
5980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005981PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005982"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005984Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005985at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
5987static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005988unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989{
5990 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5991 register const Py_UNICODE *e;
5992 int cased;
5993
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 /* Shortcut for single character strings */
5995 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005996 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005998 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005999 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006000 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006001
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 e = p + PyUnicode_GET_SIZE(self);
6003 cased = 0;
6004 for (; p < e; p++) {
6005 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006006
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006008 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 else if (!cased && Py_UNICODE_ISUPPER(ch))
6010 cased = 1;
6011 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006012 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013}
6014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006015PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006016"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006018Return True if S is a titlecased string and there is at least one\n\
6019character in S, i.e. upper- and titlecase characters may only\n\
6020follow uncased characters and lowercase characters only cased ones.\n\
6021Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
6023static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006024unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
6026 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6027 register const Py_UNICODE *e;
6028 int cased, previous_is_cased;
6029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 /* Shortcut for single character strings */
6031 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006032 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6033 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006035 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006036 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006037 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006038
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 e = p + PyUnicode_GET_SIZE(self);
6040 cased = 0;
6041 previous_is_cased = 0;
6042 for (; p < e; p++) {
6043 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006044
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6046 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006047 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 previous_is_cased = 1;
6049 cased = 1;
6050 }
6051 else if (Py_UNICODE_ISLOWER(ch)) {
6052 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006053 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 previous_is_cased = 1;
6055 cased = 1;
6056 }
6057 else
6058 previous_is_cased = 0;
6059 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006060 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061}
6062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006063PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006064"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006066Return True if all characters in S are whitespace\n\
6067and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
6069static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006070unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071{
6072 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6073 register const Py_UNICODE *e;
6074
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 /* Shortcut for single character strings */
6076 if (PyUnicode_GET_SIZE(self) == 1 &&
6077 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006078 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006080 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006081 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006082 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006083
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 e = p + PyUnicode_GET_SIZE(self);
6085 for (; p < e; p++) {
6086 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006087 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006089 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090}
6091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006092PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006093"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006094\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006095Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006096and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006097
6098static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006099unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006100{
6101 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6102 register const Py_UNICODE *e;
6103
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006104 /* Shortcut for single character strings */
6105 if (PyUnicode_GET_SIZE(self) == 1 &&
6106 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006107 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006108
6109 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006110 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006111 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006112
6113 e = p + PyUnicode_GET_SIZE(self);
6114 for (; p < e; p++) {
6115 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006116 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006117 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006118 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006119}
6120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006121PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006122"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006123\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006124Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006125and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006126
6127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006128unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006129{
6130 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6131 register const Py_UNICODE *e;
6132
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006133 /* Shortcut for single character strings */
6134 if (PyUnicode_GET_SIZE(self) == 1 &&
6135 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006136 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006137
6138 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006139 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006140 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006141
6142 e = p + PyUnicode_GET_SIZE(self);
6143 for (; p < e; p++) {
6144 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006145 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006146 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006147 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006148}
6149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006150PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006151"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006153Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006154False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
6156static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006157unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158{
6159 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6160 register const Py_UNICODE *e;
6161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 /* Shortcut for single character strings */
6163 if (PyUnicode_GET_SIZE(self) == 1 &&
6164 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006165 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006167 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006168 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006169 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006170
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 e = p + PyUnicode_GET_SIZE(self);
6172 for (; p < e; p++) {
6173 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006174 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006176 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177}
6178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006179PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006180"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006182Return True if all characters in S are digits\n\
6183and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
6185static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006186unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187{
6188 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6189 register const Py_UNICODE *e;
6190
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 /* Shortcut for single character strings */
6192 if (PyUnicode_GET_SIZE(self) == 1 &&
6193 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006194 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006196 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006197 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006198 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006199
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 e = p + PyUnicode_GET_SIZE(self);
6201 for (; p < e; p++) {
6202 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006203 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006205 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206}
6207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006208PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006209"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006211Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006212False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
6214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006215unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216{
6217 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6218 register const Py_UNICODE *e;
6219
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 /* Shortcut for single character strings */
6221 if (PyUnicode_GET_SIZE(self) == 1 &&
6222 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006223 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006225 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006226 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006227 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006228
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 e = p + PyUnicode_GET_SIZE(self);
6230 for (; p < e; p++) {
6231 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006232 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006234 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235}
6236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006237PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238"S.join(sequence) -> unicode\n\
6239\n\
6240Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006241sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242
6243static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006244unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006246 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247}
6248
Martin v. Löwis18e16552006-02-15 17:27:45 +00006249static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250unicode_length(PyUnicodeObject *self)
6251{
6252 return self->length;
6253}
6254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006255PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006256"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257\n\
6258Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006259done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
6261static PyObject *
6262unicode_ljust(PyUnicodeObject *self, PyObject *args)
6263{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006264 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006265 Py_UNICODE fillchar = ' ';
6266
Martin v. Löwis412fb672006-04-13 06:34:32 +00006267 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 return NULL;
6269
Tim Peters7a29bd52001-09-12 03:03:31 +00006270 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 Py_INCREF(self);
6272 return (PyObject*) self;
6273 }
6274
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006275 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276}
6277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006278PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279"S.lower() -> unicode\n\
6280\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006281Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282
6283static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006284unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return fixup(self, fixlower);
6287}
6288
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
6297
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006298/* externally visible for str.strip(unicode) */
6299PyObject *
6300_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6301{
6302 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006303 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006304 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006305 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6306 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006307
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006308 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6309
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006310 i = 0;
6311 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006312 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6313 i++;
6314 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006315 }
6316
6317 j = len;
6318 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006319 do {
6320 j--;
6321 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6322 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006323 }
6324
6325 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006326 Py_INCREF(self);
6327 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006328 }
6329 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006330 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006331}
6332
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333
6334static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006335do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006337 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006338 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006339
6340 i = 0;
6341 if (striptype != RIGHTSTRIP) {
6342 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6343 i++;
6344 }
6345 }
6346
6347 j = len;
6348 if (striptype != LEFTSTRIP) {
6349 do {
6350 j--;
6351 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6352 j++;
6353 }
6354
6355 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6356 Py_INCREF(self);
6357 return (PyObject*)self;
6358 }
6359 else
6360 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361}
6362
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006363
6364static PyObject *
6365do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6366{
6367 PyObject *sep = NULL;
6368
6369 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6370 return NULL;
6371
6372 if (sep != NULL && sep != Py_None) {
6373 if (PyUnicode_Check(sep))
6374 return _PyUnicode_XStrip(self, striptype, sep);
6375 else if (PyString_Check(sep)) {
6376 PyObject *res;
6377 sep = PyUnicode_FromObject(sep);
6378 if (sep==NULL)
6379 return NULL;
6380 res = _PyUnicode_XStrip(self, striptype, sep);
6381 Py_DECREF(sep);
6382 return res;
6383 }
6384 else {
6385 PyErr_Format(PyExc_TypeError,
6386 "%s arg must be None, unicode or str",
6387 STRIPNAME(striptype));
6388 return NULL;
6389 }
6390 }
6391
6392 return do_strip(self, striptype);
6393}
6394
6395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006396PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006397"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006398\n\
6399Return a copy of the string S with leading and trailing\n\
6400whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006401If chars is given and not None, remove characters in chars instead.\n\
6402If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006403
6404static PyObject *
6405unicode_strip(PyUnicodeObject *self, PyObject *args)
6406{
6407 if (PyTuple_GET_SIZE(args) == 0)
6408 return do_strip(self, BOTHSTRIP); /* Common case */
6409 else
6410 return do_argstrip(self, BOTHSTRIP, args);
6411}
6412
6413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006414PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006415"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006416\n\
6417Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006418If chars is given and not None, remove characters in chars instead.\n\
6419If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006420
6421static PyObject *
6422unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6423{
6424 if (PyTuple_GET_SIZE(args) == 0)
6425 return do_strip(self, LEFTSTRIP); /* Common case */
6426 else
6427 return do_argstrip(self, LEFTSTRIP, args);
6428}
6429
6430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006431PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006432"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006433\n\
6434Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006435If chars is given and not None, remove characters in chars instead.\n\
6436If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006437
6438static PyObject *
6439unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6440{
6441 if (PyTuple_GET_SIZE(args) == 0)
6442 return do_strip(self, RIGHTSTRIP); /* Common case */
6443 else
6444 return do_argstrip(self, RIGHTSTRIP, args);
6445}
6446
6447
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006449unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450{
6451 PyUnicodeObject *u;
6452 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006453 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006454 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455
6456 if (len < 0)
6457 len = 0;
6458
Tim Peters7a29bd52001-09-12 03:03:31 +00006459 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 /* no repeat, return original string */
6461 Py_INCREF(str);
6462 return (PyObject*) str;
6463 }
Tim Peters8f422462000-09-09 06:13:41 +00006464
6465 /* ensure # of chars needed doesn't overflow int and # of bytes
6466 * needed doesn't overflow size_t
6467 */
6468 nchars = len * str->length;
6469 if (len && nchars / len != str->length) {
6470 PyErr_SetString(PyExc_OverflowError,
6471 "repeated string is too long");
6472 return NULL;
6473 }
6474 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6475 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6476 PyErr_SetString(PyExc_OverflowError,
6477 "repeated string is too long");
6478 return NULL;
6479 }
6480 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 if (!u)
6482 return NULL;
6483
6484 p = u->str;
6485
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006486 if (str->length == 1 && len > 0) {
6487 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006488 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006489 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006490 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006491 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006492 done = str->length;
6493 }
6494 while (done < nchars) {
6495 int n = (done <= nchars-done) ? done : nchars-done;
6496 Py_UNICODE_COPY(p+done, p, n);
6497 done += n;
6498 }
6499 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500
6501 return (PyObject*) u;
6502}
6503
6504PyObject *PyUnicode_Replace(PyObject *obj,
6505 PyObject *subobj,
6506 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508{
6509 PyObject *self;
6510 PyObject *str1;
6511 PyObject *str2;
6512 PyObject *result;
6513
6514 self = PyUnicode_FromObject(obj);
6515 if (self == NULL)
6516 return NULL;
6517 str1 = PyUnicode_FromObject(subobj);
6518 if (str1 == NULL) {
6519 Py_DECREF(self);
6520 return NULL;
6521 }
6522 str2 = PyUnicode_FromObject(replobj);
6523 if (str2 == NULL) {
6524 Py_DECREF(self);
6525 Py_DECREF(str1);
6526 return NULL;
6527 }
Tim Petersced69f82003-09-16 20:30:58 +00006528 result = replace((PyUnicodeObject *)self,
6529 (PyUnicodeObject *)str1,
6530 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 maxcount);
6532 Py_DECREF(self);
6533 Py_DECREF(str1);
6534 Py_DECREF(str2);
6535 return result;
6536}
6537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006538PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539"S.replace (old, new[, maxsplit]) -> unicode\n\
6540\n\
6541Return a copy of S with all occurrences of substring\n\
6542old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006543given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544
6545static PyObject*
6546unicode_replace(PyUnicodeObject *self, PyObject *args)
6547{
6548 PyUnicodeObject *str1;
6549 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006550 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 PyObject *result;
6552
Martin v. Löwis18e16552006-02-15 17:27:45 +00006553 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 return NULL;
6555 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6556 if (str1 == NULL)
6557 return NULL;
6558 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006559 if (str2 == NULL) {
6560 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563
6564 result = replace(self, str1, str2, maxcount);
6565
6566 Py_DECREF(str1);
6567 Py_DECREF(str2);
6568 return result;
6569}
6570
6571static
6572PyObject *unicode_repr(PyObject *unicode)
6573{
6574 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6575 PyUnicode_GET_SIZE(unicode),
6576 1);
6577}
6578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006579PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580"S.rfind(sub [,start [,end]]) -> int\n\
6581\n\
6582Return the highest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00006583such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584arguments start and end are interpreted as in slice notation.\n\
6585\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006586Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587
6588static PyObject *
6589unicode_rfind(PyUnicodeObject *self, PyObject *args)
6590{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006591 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006592 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006593 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006594 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
Guido van Rossumb8872e62000-05-09 14:14:27 +00006596 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6597 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006599 substring = PyUnicode_FromObject(substring);
6600 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 return NULL;
6602
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006603 result = stringlib_rfind_slice(
6604 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6605 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6606 start, end
6607 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608
6609 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006610
6611 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006614PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615"S.rindex(sub [,start [,end]]) -> int\n\
6616\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006617Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618
6619static PyObject *
6620unicode_rindex(PyUnicodeObject *self, PyObject *args)
6621{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006622 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006623 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006624 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006625 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626
Guido van Rossumb8872e62000-05-09 14:14:27 +00006627 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6628 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006630 substring = PyUnicode_FromObject(substring);
6631 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 return NULL;
6633
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006634 result = stringlib_rfind_slice(
6635 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6636 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6637 start, end
6638 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639
6640 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006641
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 if (result < 0) {
6643 PyErr_SetString(PyExc_ValueError, "substring not found");
6644 return NULL;
6645 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006646 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006650"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651\n\
6652Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006653done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654
6655static PyObject *
6656unicode_rjust(PyUnicodeObject *self, PyObject *args)
6657{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006658 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006659 Py_UNICODE fillchar = ' ';
6660
Martin v. Löwis412fb672006-04-13 06:34:32 +00006661 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 return NULL;
6663
Tim Peters7a29bd52001-09-12 03:03:31 +00006664 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 Py_INCREF(self);
6666 return (PyObject*) self;
6667 }
6668
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006669 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670}
6671
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674{
6675 /* standard clamping */
6676 if (start < 0)
6677 start = 0;
6678 if (end < 0)
6679 end = 0;
6680 if (end > self->length)
6681 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006682 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 /* full slice, return original string */
6684 Py_INCREF(self);
6685 return (PyObject*) self;
6686 }
6687 if (start > end)
6688 start = end;
6689 /* copy slice */
6690 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6691 end - start);
6692}
6693
6694PyObject *PyUnicode_Split(PyObject *s,
6695 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006696 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697{
6698 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006699
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 s = PyUnicode_FromObject(s);
6701 if (s == NULL)
6702 return NULL;
6703 if (sep != NULL) {
6704 sep = PyUnicode_FromObject(sep);
6705 if (sep == NULL) {
6706 Py_DECREF(s);
6707 return NULL;
6708 }
6709 }
6710
6711 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6712
6713 Py_DECREF(s);
6714 Py_XDECREF(sep);
6715 return result;
6716}
6717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006718PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719"S.split([sep [,maxsplit]]) -> list of strings\n\
6720\n\
6721Return a list of the words in S, using sep as the\n\
6722delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006723splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006724any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725
6726static PyObject*
6727unicode_split(PyUnicodeObject *self, PyObject *args)
6728{
6729 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006730 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
Martin v. Löwis18e16552006-02-15 17:27:45 +00006732 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 return NULL;
6734
6735 if (substring == Py_None)
6736 return split(self, NULL, maxcount);
6737 else if (PyUnicode_Check(substring))
6738 return split(self, (PyUnicodeObject *)substring, maxcount);
6739 else
6740 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6741}
6742
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006743PyObject *
6744PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6745{
6746 PyObject* str_obj;
6747 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006748 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006749
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006750 str_obj = PyUnicode_FromObject(str_in);
6751 if (!str_obj)
6752 return NULL;
6753 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006754 if (!sep_obj) {
6755 Py_DECREF(str_obj);
6756 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006757 }
6758
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006759 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006760 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6761 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6762 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006763
Fredrik Lundhb9479482006-05-26 17:22:38 +00006764 Py_DECREF(sep_obj);
6765 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006766
6767 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006768}
6769
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006770
6771PyObject *
6772PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6773{
6774 PyObject* str_obj;
6775 PyObject* sep_obj;
6776 PyObject* out;
6777
6778 str_obj = PyUnicode_FromObject(str_in);
6779 if (!str_obj)
6780 return NULL;
6781 sep_obj = PyUnicode_FromObject(sep_in);
6782 if (!sep_obj) {
6783 Py_DECREF(str_obj);
6784 return NULL;
6785 }
6786
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006787 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006788 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6789 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6790 );
6791
6792 Py_DECREF(sep_obj);
6793 Py_DECREF(str_obj);
6794
6795 return out;
6796}
6797
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006798PyDoc_STRVAR(partition__doc__,
6799"S.partition(sep) -> (head, sep, tail)\n\
6800\n\
6801Searches for the separator sep in S, and returns the part before it,\n\
6802the separator itself, and the part after it. If the separator is not\n\
6803found, returns S and two empty strings.");
6804
6805static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006806unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006807{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006808 return PyUnicode_Partition((PyObject *)self, separator);
6809}
6810
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006811PyDoc_STRVAR(rpartition__doc__,
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006812"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006813\n\
6814Searches for the separator sep in S, starting at the end of S, and returns\n\
6815the part before it, the separator itself, and the part after it. If the\n\
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006816separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006817
6818static PyObject*
6819unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6820{
6821 return PyUnicode_RPartition((PyObject *)self, separator);
6822}
6823
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006824PyObject *PyUnicode_RSplit(PyObject *s,
6825 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006826 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006827{
6828 PyObject *result;
6829
6830 s = PyUnicode_FromObject(s);
6831 if (s == NULL)
6832 return NULL;
6833 if (sep != NULL) {
6834 sep = PyUnicode_FromObject(sep);
6835 if (sep == NULL) {
6836 Py_DECREF(s);
6837 return NULL;
6838 }
6839 }
6840
6841 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6842
6843 Py_DECREF(s);
6844 Py_XDECREF(sep);
6845 return result;
6846}
6847
6848PyDoc_STRVAR(rsplit__doc__,
6849"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6850\n\
6851Return a list of the words in S, using sep as the\n\
6852delimiter string, starting at the end of the string and\n\
6853working to the front. If maxsplit is given, at most maxsplit\n\
6854splits are done. If sep is not specified, any whitespace string\n\
6855is a separator.");
6856
6857static PyObject*
6858unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6859{
6860 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006861 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006862
Martin v. Löwis18e16552006-02-15 17:27:45 +00006863 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006864 return NULL;
6865
6866 if (substring == Py_None)
6867 return rsplit(self, NULL, maxcount);
6868 else if (PyUnicode_Check(substring))
6869 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6870 else
6871 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6872}
6873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006874PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006875"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876\n\
6877Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006878Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006879is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880
6881static PyObject*
6882unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6883{
Guido van Rossum86662912000-04-11 15:38:46 +00006884 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
Guido van Rossum86662912000-04-11 15:38:46 +00006886 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 return NULL;
6888
Guido van Rossum86662912000-04-11 15:38:46 +00006889 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890}
6891
6892static
6893PyObject *unicode_str(PyUnicodeObject *self)
6894{
Fred Drakee4315f52000-05-09 19:53:39 +00006895 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006898PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899"S.swapcase() -> unicode\n\
6900\n\
6901Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006902and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903
6904static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006905unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 return fixup(self, fixswapcase);
6908}
6909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911"S.translate(table) -> unicode\n\
6912\n\
6913Return a copy of the string S, where all characters have been mapped\n\
6914through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006915Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6916Unmapped characters are left untouched. Characters mapped to None\n\
6917are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
6919static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006920unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921{
Tim Petersced69f82003-09-16 20:30:58 +00006922 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006924 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 "ignore");
6926}
6927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006928PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929"S.upper() -> unicode\n\
6930\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006931Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932
6933static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006934unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 return fixup(self, fixupper);
6937}
6938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006939PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940"S.zfill(width) -> unicode\n\
6941\n\
6942Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006943of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944
6945static PyObject *
6946unicode_zfill(PyUnicodeObject *self, PyObject *args)
6947{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006948 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 PyUnicodeObject *u;
6950
Martin v. Löwis18e16552006-02-15 17:27:45 +00006951 Py_ssize_t width;
6952 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 return NULL;
6954
6955 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006956 if (PyUnicode_CheckExact(self)) {
6957 Py_INCREF(self);
6958 return (PyObject*) self;
6959 }
6960 else
6961 return PyUnicode_FromUnicode(
6962 PyUnicode_AS_UNICODE(self),
6963 PyUnicode_GET_SIZE(self)
6964 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 }
6966
6967 fill = width - self->length;
6968
6969 u = pad(self, fill, 0, '0');
6970
Walter Dörwald068325e2002-04-15 13:36:47 +00006971 if (u == NULL)
6972 return NULL;
6973
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 if (u->str[fill] == '+' || u->str[fill] == '-') {
6975 /* move sign to beginning of string */
6976 u->str[0] = u->str[fill];
6977 u->str[fill] = '0';
6978 }
6979
6980 return (PyObject*) u;
6981}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size_obj;

    size_obj = PyInt_FromLong(unicode_freelist_size);
    return size_obj;
}
#endif
6990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006991PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006992"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006994Return True if S starts with the specified prefix, False otherwise.\n\
6995With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006996With optional end, stop comparing S at that position.\n\
6997prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998
6999static PyObject *
7000unicode_startswith(PyUnicodeObject *self,
7001 PyObject *args)
7002{
Georg Brandl24250812006-06-09 18:45:48 +00007003 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007005 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007006 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007007 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008
Georg Brandl24250812006-06-09 18:45:48 +00007009 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007010 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007012 if (PyTuple_Check(subobj)) {
7013 Py_ssize_t i;
7014 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7015 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7016 PyTuple_GET_ITEM(subobj, i));
7017 if (substring == NULL)
7018 return NULL;
7019 result = tailmatch(self, substring, start, end, -1);
7020 Py_DECREF(substring);
7021 if (result) {
7022 Py_RETURN_TRUE;
7023 }
7024 }
7025 /* nothing matched */
7026 Py_RETURN_FALSE;
7027 }
7028 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007030 return NULL;
7031 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007033 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034}
7035
7036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007037PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007038"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007040Return True if S ends with the specified suffix, False otherwise.\n\
7041With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007042With optional end, stop comparing S at that position.\n\
7043suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044
7045static PyObject *
7046unicode_endswith(PyUnicodeObject *self,
7047 PyObject *args)
7048{
Georg Brandl24250812006-06-09 18:45:48 +00007049 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007051 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007052 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007053 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054
Georg Brandl24250812006-06-09 18:45:48 +00007055 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7056 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007058 if (PyTuple_Check(subobj)) {
7059 Py_ssize_t i;
7060 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7061 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7062 PyTuple_GET_ITEM(subobj, i));
7063 if (substring == NULL)
7064 return NULL;
7065 result = tailmatch(self, substring, start, end, +1);
7066 Py_DECREF(substring);
7067 if (result) {
7068 Py_RETURN_TRUE;
7069 }
7070 }
7071 Py_RETURN_FALSE;
7072 }
7073 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007075 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076
Georg Brandl24250812006-06-09 18:45:48 +00007077 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007079 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080}
7081
7082
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007083
7084static PyObject *
7085unicode_getnewargs(PyUnicodeObject *v)
7086{
7087 return Py_BuildValue("(u#)", v->str, v->length);
7088}
7089
7090
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091static PyMethodDef unicode_methods[] = {
7092
7093 /* Order is according to common usage: often used methods should
7094 appear first, since lookup is done sequentially. */
7095
Georg Brandlecdc0a92006-03-30 12:19:07 +00007096 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007097 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7098 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007099 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007100 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7101 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7102 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7103 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7104 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7105 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7106 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007107 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007108 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7109 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7110 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007112 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007113/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7114 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7115 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7116 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007117 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007118 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007119 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007120 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007121 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7122 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7123 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7124 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7125 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7126 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7127 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7128 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7129 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7130 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7131 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7132 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7133 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7134 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007135 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007136#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007137 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138#endif
7139
7140#if 0
7141 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007142 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143#endif
7144
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007145 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 {NULL, NULL}
7147};
7148
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007149static PyObject *
7150unicode_mod(PyObject *v, PyObject *w)
7151{
7152 if (!PyUnicode_Check(v)) {
7153 Py_INCREF(Py_NotImplemented);
7154 return Py_NotImplemented;
7155 }
7156 return PyUnicode_Format(v, w);
7157}
7158
7159static PyNumberMethods unicode_as_number = {
7160 0, /*nb_add*/
7161 0, /*nb_subtract*/
7162 0, /*nb_multiply*/
7163 0, /*nb_divide*/
7164 unicode_mod, /*nb_remainder*/
7165};
7166
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007168 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007169 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007170 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7171 (ssizeargfunc) unicode_getitem, /* sq_item */
7172 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 0, /* sq_ass_item */
7174 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007175 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176};
7177
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007178static PyObject*
7179unicode_subscript(PyUnicodeObject* self, PyObject* item)
7180{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007181 if (PyIndex_Check(item)) {
7182 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007183 if (i == -1 && PyErr_Occurred())
7184 return NULL;
7185 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007186 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007187 return unicode_getitem(self, i);
7188 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007189 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007190 Py_UNICODE* source_buf;
7191 Py_UNICODE* result_buf;
7192 PyObject* result;
7193
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007194 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007195 &start, &stop, &step, &slicelength) < 0) {
7196 return NULL;
7197 }
7198
7199 if (slicelength <= 0) {
7200 return PyUnicode_FromUnicode(NULL, 0);
7201 } else {
7202 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitzb3635f92008-03-18 04:17:36 +00007203 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7204 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007205
7206 if (result_buf == NULL)
7207 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007208
7209 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7210 result_buf[i] = source_buf[cur];
7211 }
Tim Petersced69f82003-09-16 20:30:58 +00007212
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007213 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitzb3635f92008-03-18 04:17:36 +00007214 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007215 return result;
7216 }
7217 } else {
7218 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7219 return NULL;
7220 }
7221}
7222
7223static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007224 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007225 (binaryfunc)unicode_subscript, /* mp_subscript */
7226 (objobjargproc)0, /* mp_ass_subscript */
7227};
7228
Martin v. Löwis18e16552006-02-15 17:27:45 +00007229static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007231 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 const void **ptr)
7233{
7234 if (index != 0) {
7235 PyErr_SetString(PyExc_SystemError,
7236 "accessing non-existent unicode segment");
7237 return -1;
7238 }
7239 *ptr = (void *) self->str;
7240 return PyUnicode_GET_DATA_SIZE(self);
7241}
7242
Martin v. Löwis18e16552006-02-15 17:27:45 +00007243static Py_ssize_t
7244unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 const void **ptr)
7246{
7247 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007248 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 return -1;
7250}
7251
7252static int
7253unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255{
7256 if (lenp)
7257 *lenp = PyUnicode_GET_DATA_SIZE(self);
7258 return 1;
7259}
7260
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007261static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007263 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 const void **ptr)
7265{
7266 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007267
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 if (index != 0) {
7269 PyErr_SetString(PyExc_SystemError,
7270 "accessing non-existent unicode segment");
7271 return -1;
7272 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007273 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 if (str == NULL)
7275 return -1;
7276 *ptr = (void *) PyString_AS_STRING(str);
7277 return PyString_GET_SIZE(str);
7278}
7279
7280/* Helpers for PyUnicode_Format() */
7281
7282static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007285 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 if (argidx < arglen) {
7287 (*p_argidx)++;
7288 if (arglen < 0)
7289 return args;
7290 else
7291 return PyTuple_GetItem(args, argidx);
7292 }
7293 PyErr_SetString(PyExc_TypeError,
7294 "not enough arguments for format string");
7295 return NULL;
7296}
7297
/* Bit flags carried in the `flags` argument of the format helpers.
   F_ALT selects the alternate ('#') form in formatfloat() and
   formatint(); the remaining bits presumably mirror the other printf
   flag characters -- confirm against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7303
Martin v. Löwis18e16552006-02-15 17:27:45 +00007304static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007305strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007307 register Py_ssize_t i;
7308 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 for (i = len - 1; i >= 0; i--)
7310 buffer[i] = (Py_UNICODE) charbuffer[i];
7311
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 return len;
7313}
7314
Neal Norwitzfc76d632006-01-10 06:03:13 +00007315static int
7316doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7317{
Tim Peters15231542006-02-16 01:08:01 +00007318 Py_ssize_t result;
7319
Neal Norwitzfc76d632006-01-10 06:03:13 +00007320 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007321 result = strtounicode(buffer, (char *)buffer);
7322 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007323}
7324
7325static int
7326longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7327{
Tim Peters15231542006-02-16 01:08:01 +00007328 Py_ssize_t result;
7329
Neal Norwitzfc76d632006-01-10 06:03:13 +00007330 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007331 result = strtounicode(buffer, (char *)buffer);
7332 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007333}
7334
Guido van Rossum078151d2002-08-11 04:24:12 +00007335/* XXX To save some code duplication, formatfloat/long/int could have been
7336 shared with stringobject.c, converting from 8-bit to Unicode after the
7337 formatting is done. */
7338
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339static int
7340formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007341 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 int flags,
7343 int prec,
7344 int type,
7345 PyObject *v)
7346{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007347 /* fmt = '%#.' + `prec` + `type`
7348 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 char fmt[20];
7350 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007351
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 x = PyFloat_AsDouble(v);
7353 if (x == -1.0 && PyErr_Occurred())
7354 return -1;
7355 if (prec < 0)
7356 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7358 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007359 /* Worst case length calc to ensure no buffer overrun:
7360
7361 'g' formats:
7362 fmt = %#.<prec>g
7363 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7364 for any double rep.)
7365 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7366
7367 'f' formats:
7368 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7369 len = 1 + 50 + 1 + prec = 52 + prec
7370
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007371 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007372 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007373
7374 */
Georg Brandlc5db9232007-07-12 08:38:04 +00007375 if (((type == 'g' || type == 'G') &&
7376 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007377 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007378 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007379 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007380 return -1;
7381 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007382 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7383 (flags&F_ALT) ? "#" : "",
7384 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007385 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386}
7387
Tim Peters38fd5b62000-09-21 05:43:11 +00007388static PyObject*
7389formatlong(PyObject *val, int flags, int prec, int type)
7390{
7391 char *buf;
7392 int i, len;
7393 PyObject *str; /* temporary string object. */
7394 PyUnicodeObject *result;
7395
7396 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7397 if (!str)
7398 return NULL;
7399 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007400 if (!result) {
7401 Py_DECREF(str);
7402 return NULL;
7403 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007404 for (i = 0; i < len; i++)
7405 result->str[i] = buf[i];
7406 result->str[len] = 0;
7407 Py_DECREF(str);
7408 return (PyObject*)result;
7409}
7410
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411static int
7412formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007413 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 int flags,
7415 int prec,
7416 int type,
7417 PyObject *v)
7418{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007419 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007420 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7421 * + 1 + 1
7422 * = 24
7423 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007424 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007425 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 long x;
7427
7428 x = PyInt_AsLong(v);
7429 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007430 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007431 if (x < 0 && type == 'u') {
7432 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007433 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007434 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7435 sign = "-";
7436 else
7437 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007439 prec = 1;
7440
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007441 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7442 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007443 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007444 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007445 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007446 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007447 return -1;
7448 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007449
7450 if ((flags & F_ALT) &&
7451 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007452 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007453 * of issues that cause pain:
7454 * - when 0 is being converted, the C standard leaves off
7455 * the '0x' or '0X', which is inconsistent with other
7456 * %#x/%#X conversions and inconsistent with Python's
7457 * hex() function
7458 * - there are platforms that violate the standard and
7459 * convert 0 with the '0x' or '0X'
7460 * (Metrowerks, Compaq Tru64)
7461 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007462 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007463 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007464 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007465 * We can achieve the desired consistency by inserting our
7466 * own '0x' or '0X' prefix, and substituting %x/%X in place
7467 * of %#x/%#X.
7468 *
7469 * Note that this is the same approach as used in
7470 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007471 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007472 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7473 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007474 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007475 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007476 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7477 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007478 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007479 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007480 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007481 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007482 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007483 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484}
7485
7486static int
7487formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007488 size_t buflen,
7489 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007491 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007492 if (PyUnicode_Check(v)) {
7493 if (PyUnicode_GET_SIZE(v) != 1)
7494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007498 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007499 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007500 goto onError;
7501 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503
7504 else {
7505 /* Integer input truncated to a character */
7506 long x;
7507 x = PyInt_AsLong(v);
7508 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007509 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007510#ifdef Py_UNICODE_WIDE
7511 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007512 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007513 "%c arg not in range(0x110000) "
7514 "(wide Python build)");
7515 return -1;
7516 }
7517#else
7518 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007519 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007520 "%c arg not in range(0x10000) "
7521 "(narrow Python build)");
7522 return -1;
7523 }
7524#endif
7525 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 }
7527 buf[1] = '\0';
7528 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007529
7530 onError:
7531 PyErr_SetString(PyExc_TypeError,
7532 "%c requires int or char");
7533 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534}
7535
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the cast cannot bind to
   neighbouring operators at the point of use.
*/
#define FORMATBUFLEN ((size_t)120)
7545
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546PyObject *PyUnicode_Format(PyObject *format,
7547 PyObject *args)
7548{
7549 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007550 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 int args_owned = 0;
7552 PyUnicodeObject *result = NULL;
7553 PyObject *dict = NULL;
7554 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007555
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 if (format == NULL || args == NULL) {
7557 PyErr_BadInternalCall();
7558 return NULL;
7559 }
7560 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007561 if (uformat == NULL)
7562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 fmt = PyUnicode_AS_UNICODE(uformat);
7564 fmtcnt = PyUnicode_GET_SIZE(uformat);
7565
7566 reslen = rescnt = fmtcnt + 100;
7567 result = _PyUnicode_New(reslen);
7568 if (result == NULL)
7569 goto onError;
7570 res = PyUnicode_AS_UNICODE(result);
7571
7572 if (PyTuple_Check(args)) {
7573 arglen = PyTuple_Size(args);
7574 argidx = 0;
7575 }
7576 else {
7577 arglen = -1;
7578 argidx = -2;
7579 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007580 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7581 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 dict = args;
7583
7584 while (--fmtcnt >= 0) {
7585 if (*fmt != '%') {
7586 if (--rescnt < 0) {
7587 rescnt = fmtcnt + 100;
7588 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007589 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007590 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7592 --rescnt;
7593 }
7594 *res++ = *fmt++;
7595 }
7596 else {
7597 /* Got a format specifier */
7598 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007599 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 Py_UNICODE c = '\0';
7602 Py_UNICODE fill;
7603 PyObject *v = NULL;
7604 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007605 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007607 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007608 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609
7610 fmt++;
7611 if (*fmt == '(') {
7612 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007613 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 PyObject *key;
7615 int pcount = 1;
7616
7617 if (dict == NULL) {
7618 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007619 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 goto onError;
7621 }
7622 ++fmt;
7623 --fmtcnt;
7624 keystart = fmt;
7625 /* Skip over balanced parentheses */
7626 while (pcount > 0 && --fmtcnt >= 0) {
7627 if (*fmt == ')')
7628 --pcount;
7629 else if (*fmt == '(')
7630 ++pcount;
7631 fmt++;
7632 }
7633 keylen = fmt - keystart - 1;
7634 if (fmtcnt < 0 || pcount > 0) {
7635 PyErr_SetString(PyExc_ValueError,
7636 "incomplete format key");
7637 goto onError;
7638 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007639#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007640 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 then looked up since Python uses strings to hold
7642 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007643 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 key = PyUnicode_EncodeUTF8(keystart,
7645 keylen,
7646 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007647#else
7648 key = PyUnicode_FromUnicode(keystart, keylen);
7649#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650 if (key == NULL)
7651 goto onError;
7652 if (args_owned) {
7653 Py_DECREF(args);
7654 args_owned = 0;
7655 }
7656 args = PyObject_GetItem(dict, key);
7657 Py_DECREF(key);
7658 if (args == NULL) {
7659 goto onError;
7660 }
7661 args_owned = 1;
7662 arglen = -1;
7663 argidx = -2;
7664 }
7665 while (--fmtcnt >= 0) {
7666 switch (c = *fmt++) {
7667 case '-': flags |= F_LJUST; continue;
7668 case '+': flags |= F_SIGN; continue;
7669 case ' ': flags |= F_BLANK; continue;
7670 case '#': flags |= F_ALT; continue;
7671 case '0': flags |= F_ZERO; continue;
7672 }
7673 break;
7674 }
7675 if (c == '*') {
7676 v = getnextarg(args, arglen, &argidx);
7677 if (v == NULL)
7678 goto onError;
7679 if (!PyInt_Check(v)) {
7680 PyErr_SetString(PyExc_TypeError,
7681 "* wants int");
7682 goto onError;
7683 }
7684 width = PyInt_AsLong(v);
7685 if (width < 0) {
7686 flags |= F_LJUST;
7687 width = -width;
7688 }
7689 if (--fmtcnt >= 0)
7690 c = *fmt++;
7691 }
7692 else if (c >= '0' && c <= '9') {
7693 width = c - '0';
7694 while (--fmtcnt >= 0) {
7695 c = *fmt++;
7696 if (c < '0' || c > '9')
7697 break;
7698 if ((width*10) / 10 != width) {
7699 PyErr_SetString(PyExc_ValueError,
7700 "width too big");
7701 goto onError;
7702 }
7703 width = width*10 + (c - '0');
7704 }
7705 }
7706 if (c == '.') {
7707 prec = 0;
7708 if (--fmtcnt >= 0)
7709 c = *fmt++;
7710 if (c == '*') {
7711 v = getnextarg(args, arglen, &argidx);
7712 if (v == NULL)
7713 goto onError;
7714 if (!PyInt_Check(v)) {
7715 PyErr_SetString(PyExc_TypeError,
7716 "* wants int");
7717 goto onError;
7718 }
7719 prec = PyInt_AsLong(v);
7720 if (prec < 0)
7721 prec = 0;
7722 if (--fmtcnt >= 0)
7723 c = *fmt++;
7724 }
7725 else if (c >= '0' && c <= '9') {
7726 prec = c - '0';
7727 while (--fmtcnt >= 0) {
7728 c = Py_CHARMASK(*fmt++);
7729 if (c < '0' || c > '9')
7730 break;
7731 if ((prec*10) / 10 != prec) {
7732 PyErr_SetString(PyExc_ValueError,
7733 "prec too big");
7734 goto onError;
7735 }
7736 prec = prec*10 + (c - '0');
7737 }
7738 }
7739 } /* prec */
7740 if (fmtcnt >= 0) {
7741 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 if (--fmtcnt >= 0)
7743 c = *fmt++;
7744 }
7745 }
7746 if (fmtcnt < 0) {
7747 PyErr_SetString(PyExc_ValueError,
7748 "incomplete format");
7749 goto onError;
7750 }
7751 if (c != '%') {
7752 v = getnextarg(args, arglen, &argidx);
7753 if (v == NULL)
7754 goto onError;
7755 }
7756 sign = 0;
7757 fill = ' ';
7758 switch (c) {
7759
7760 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007761 pbuf = formatbuf;
7762 /* presume that buffer length is at least 1 */
7763 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764 len = 1;
7765 break;
7766
7767 case 's':
7768 case 'r':
7769 if (PyUnicode_Check(v) && c == 's') {
7770 temp = v;
7771 Py_INCREF(temp);
7772 }
7773 else {
7774 PyObject *unicode;
7775 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007776 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777 else
7778 temp = PyObject_Repr(v);
7779 if (temp == NULL)
7780 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007781 if (PyUnicode_Check(temp))
7782 /* nothing to do */;
7783 else if (PyString_Check(temp)) {
7784 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007785 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007787 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007789 Py_DECREF(temp);
7790 temp = unicode;
7791 if (temp == NULL)
7792 goto onError;
7793 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007794 else {
7795 Py_DECREF(temp);
7796 PyErr_SetString(PyExc_TypeError,
7797 "%s argument has non-string str()");
7798 goto onError;
7799 }
7800 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007801 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 len = PyUnicode_GET_SIZE(temp);
7803 if (prec >= 0 && len > prec)
7804 len = prec;
7805 break;
7806
7807 case 'i':
7808 case 'd':
7809 case 'u':
7810 case 'o':
7811 case 'x':
7812 case 'X':
7813 if (c == 'i')
7814 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007815 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007816 temp = formatlong(v, flags, prec, c);
7817 if (!temp)
7818 goto onError;
7819 pbuf = PyUnicode_AS_UNICODE(temp);
7820 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007821 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007823 else {
7824 pbuf = formatbuf;
7825 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7826 flags, prec, c, v);
7827 if (len < 0)
7828 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007829 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007830 }
7831 if (flags & F_ZERO)
7832 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 break;
7834
7835 case 'e':
7836 case 'E':
7837 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007838 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 case 'g':
7840 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007841 if (c == 'F')
7842 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007843 pbuf = formatbuf;
7844 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7845 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 if (len < 0)
7847 goto onError;
7848 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007849 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 fill = '0';
7851 break;
7852
7853 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007854 pbuf = formatbuf;
7855 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 if (len < 0)
7857 goto onError;
7858 break;
7859
7860 default:
7861 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007862 "unsupported format character '%c' (0x%x) "
Armin Rigo4b63c212006-10-04 11:44:06 +00007863 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007864 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007865 (int)c,
Armin Rigo4b63c212006-10-04 11:44:06 +00007866 (Py_ssize_t)(fmt - 1 -
7867 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 goto onError;
7869 }
7870 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007871 if (*pbuf == '-' || *pbuf == '+') {
7872 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873 len--;
7874 }
7875 else if (flags & F_SIGN)
7876 sign = '+';
7877 else if (flags & F_BLANK)
7878 sign = ' ';
7879 else
7880 sign = 0;
7881 }
7882 if (width < len)
7883 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007884 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885 reslen -= rescnt;
7886 rescnt = width + fmtcnt + 100;
7887 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007888 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007889 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007890 PyErr_NoMemory();
7891 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007892 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007893 if (_PyUnicode_Resize(&result, reslen) < 0) {
7894 Py_XDECREF(temp);
7895 goto onError;
7896 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 res = PyUnicode_AS_UNICODE(result)
7898 + reslen - rescnt;
7899 }
7900 if (sign) {
7901 if (fill != ' ')
7902 *res++ = sign;
7903 rescnt--;
7904 if (width > len)
7905 width--;
7906 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007907 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7908 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007909 assert(pbuf[1] == c);
7910 if (fill != ' ') {
7911 *res++ = *pbuf++;
7912 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007913 }
Tim Petersfff53252001-04-12 18:38:48 +00007914 rescnt -= 2;
7915 width -= 2;
7916 if (width < 0)
7917 width = 0;
7918 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920 if (width > len && !(flags & F_LJUST)) {
7921 do {
7922 --rescnt;
7923 *res++ = fill;
7924 } while (--width > len);
7925 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007926 if (fill == ' ') {
7927 if (sign)
7928 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007929 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007930 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007931 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007932 *res++ = *pbuf++;
7933 *res++ = *pbuf++;
7934 }
7935 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007936 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937 res += len;
7938 rescnt -= len;
7939 while (--width >= len) {
7940 --rescnt;
7941 *res++ = ' ';
7942 }
7943 if (dict && (argidx < arglen) && c != '%') {
7944 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007945 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007946 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947 goto onError;
7948 }
7949 Py_XDECREF(temp);
7950 } /* '%' */
7951 } /* until end */
7952 if (argidx < arglen && !dict) {
7953 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007954 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 goto onError;
7956 }
7957
Thomas Woutersa96affe2006-03-12 00:29:36 +00007958 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7959 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 if (args_owned) {
7961 Py_DECREF(args);
7962 }
7963 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 return (PyObject *)result;
7965
7966 onError:
7967 Py_XDECREF(result);
7968 Py_DECREF(uformat);
7969 if (args_owned) {
7970 Py_DECREF(args);
7971 }
7972 return NULL;
7973}
7974
7975static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007976 (readbufferproc) unicode_buffer_getreadbuf,
7977 (writebufferproc) unicode_buffer_getwritebuf,
7978 (segcountproc) unicode_buffer_getsegcount,
7979 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980};
7981
Jeremy Hylton938ace62002-07-17 16:30:39 +00007982static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007983unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7984
Tim Peters6d6c1a32001-08-02 04:15:00 +00007985static PyObject *
7986unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7987{
7988 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007989 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007990 char *encoding = NULL;
7991 char *errors = NULL;
7992
Guido van Rossume023fe02001-08-30 03:12:59 +00007993 if (type != &PyUnicode_Type)
7994 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007995 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7996 kwlist, &x, &encoding, &errors))
7997 return NULL;
7998 if (x == NULL)
7999 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008000 if (encoding == NULL && errors == NULL)
8001 return PyObject_Unicode(x);
8002 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008003 return PyUnicode_FromEncodedObject(x, encoding, errors);
8004}
8005
Guido van Rossume023fe02001-08-30 03:12:59 +00008006static PyObject *
8007unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8008{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008009 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008010 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008011
8012 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8013 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8014 if (tmp == NULL)
8015 return NULL;
8016 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008017 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008018 if (pnew == NULL) {
8019 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008020 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008021 }
Neal Norwitzb3635f92008-03-18 04:17:36 +00008022 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008023 if (pnew->str == NULL) {
8024 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008025 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008026 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008027 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008028 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008029 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8030 pnew->length = n;
8031 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008032 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008033 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008034}
8035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008036PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008037"unicode(string [, encoding[, errors]]) -> object\n\
8038\n\
8039Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008040encoding defaults to the current default string encoding.\n\
8041errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008042
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043PyTypeObject PyUnicode_Type = {
8044 PyObject_HEAD_INIT(&PyType_Type)
8045 0, /* ob_size */
8046 "unicode", /* tp_name */
8047 sizeof(PyUnicodeObject), /* tp_size */
8048 0, /* tp_itemsize */
8049 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008050 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008052 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008054 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008055 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008056 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008058 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059 (hashfunc) unicode_hash, /* tp_hash*/
8060 0, /* tp_call*/
8061 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008062 PyObject_GenericGetAttr, /* tp_getattro */
8063 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008065 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8066 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008067 unicode_doc, /* tp_doc */
8068 0, /* tp_traverse */
8069 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008070 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008071 0, /* tp_weaklistoffset */
8072 0, /* tp_iter */
8073 0, /* tp_iternext */
8074 unicode_methods, /* tp_methods */
8075 0, /* tp_members */
8076 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008077 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008078 0, /* tp_dict */
8079 0, /* tp_descr_get */
8080 0, /* tp_descr_set */
8081 0, /* tp_dictoffset */
8082 0, /* tp_init */
8083 0, /* tp_alloc */
8084 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008085 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086};
8087
8088/* Initialize the Unicode implementation */
8089
Thomas Wouters78890102000-07-22 19:25:51 +00008090void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008092 int i;
8093
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008094 /* XXX - move this array to unicodectype.c ? */
8095 Py_UNICODE linebreak[] = {
8096 0x000A, /* LINE FEED */
8097 0x000D, /* CARRIAGE RETURN */
8098 0x001C, /* FILE SEPARATOR */
8099 0x001D, /* GROUP SEPARATOR */
8100 0x001E, /* RECORD SEPARATOR */
8101 0x0085, /* NEXT LINE */
8102 0x2028, /* LINE SEPARATOR */
8103 0x2029, /* PARAGRAPH SEPARATOR */
8104 };
8105
Fred Drakee4315f52000-05-09 19:53:39 +00008106 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008107 unicode_freelist = NULL;
8108 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008110 if (!unicode_empty)
8111 return;
8112
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008113 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008114 for (i = 0; i < 256; i++)
8115 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008116 if (PyType_Ready(&PyUnicode_Type) < 0)
8117 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008118
8119 /* initialize the linebreak bloom filter */
8120 bloom_linebreak = make_bloom_mask(
8121 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8122 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008123
8124 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125}
8126
8127/* Finalize the Unicode implementation */
8128
8129void
Thomas Wouters78890102000-07-22 19:25:51 +00008130_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008132 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008133 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008135 Py_XDECREF(unicode_empty);
8136 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008137
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008138 for (i = 0; i < 256; i++) {
8139 if (unicode_latin1[i]) {
8140 Py_DECREF(unicode_latin1[i]);
8141 unicode_latin1[i] = NULL;
8142 }
8143 }
8144
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008145 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 PyUnicodeObject *v = u;
8147 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008148 if (v->str)
Neal Norwitzb3635f92008-03-18 04:17:36 +00008149 PyObject_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008150 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008151 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008153 unicode_freelist = NULL;
8154 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008156
Anthony Baxterac6bd462006-04-13 02:06:09 +00008157#ifdef __cplusplus
8158}
8159#endif
8160
8161
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008162/*
8163Local variables:
8164c-basic-offset: 4
8165indent-tabs-mode: nil
8166End:
8167*/