blob: af6f67a18b2f7c45cbe0e81003a592fa3aa36b37 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list "stay alive" optimization.

   An object that is put on the free list keeps its character buffer
   allocated as long as its length is below this limit.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits from each Unicode character as the bit index (0..31). */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by `ch` is set in `mask`.  The constant
   is 1UL rather than 1: bit index 31 is reachable, and shifting a signed
   int 1 left by 31 is undefined behaviour.  `mask` is parenthesized so
   that arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter via the bloom mask, then the exact linebreak test. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Membership test with a bloom pre-filter: the exact (linear) search in
   unicode_member() only runs when the mask says the character might be
   present.  The expansion and its arguments are fully parenthesized so
   the macro composes correctly with `!`, `||` and similar operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
Neal Norwitzb3635f92008-03-18 04:17:36 +0000203 unicode->str = PyObject_REALLOC(unicode->str,
204 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000206 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 PyErr_NoMemory();
208 return -1;
209 }
210 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000211 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000215 if (unicode->defenc) {
216 Py_DECREF(unicode->defenc);
217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 }
219 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000220
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 return 0;
222}
223
224/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000225 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226
227 XXX This allocator could further be enhanced by assuring that the
228 free list never reduces its size below 1.
229
230*/
231
232static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000233PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234{
235 register PyUnicodeObject *unicode;
236
Andrew Dalkee0df7622006-05-27 11:04:36 +0000237 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 if (length == 0 && unicode_empty != NULL) {
239 Py_INCREF(unicode_empty);
240 return unicode_empty;
241 }
242
Neal Norwitz4f3be8a2008-07-31 17:08:14 +0000243 /* Ensure we won't overflow the size. */
244 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
245 return (PyUnicodeObject *)PyErr_NoMemory();
246 }
247
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000258 PyObject_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000263 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
264 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000265 }
266 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000269 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000270 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271 if (unicode == NULL)
272 return NULL;
Neal Norwitzb3635f92008-03-18 04:17:36 +0000273 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
274 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 }
276
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 if (!unicode->str) {
278 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000279 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000280 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000281 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000282 * the caller fails before initializing str -- unicode_resize()
283 * reads str[0], and the Keep-Alive optimization can keep memory
284 * allocated for str alive across a call to unicode_dealloc(unicode).
285 * We don't want unicode_resize to read uninitialized memory in
286 * that case.
287 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000288 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000290 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000292 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000294
295 onError:
296 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000297 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299}
300
301static
Guido van Rossum9475a232001-10-05 20:51:39 +0000302void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000304 if (PyUnicode_CheckExact(unicode) &&
305 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 /* Keep-Alive optimization */
307 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000308 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 unicode->str = NULL;
310 unicode->length = 0;
311 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000312 if (unicode->defenc) {
313 Py_DECREF(unicode->defenc);
314 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000315 }
316 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317 *(PyUnicodeObject **)unicode = unicode_freelist;
318 unicode_freelist = unicode;
319 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 }
321 else {
Neal Norwitzb3635f92008-03-18 04:17:36 +0000322 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000323 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000324 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 }
326}
327
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329{
330 register PyUnicodeObject *v;
331
332 /* Argument checks */
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
336 }
337 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000338 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000339 PyErr_BadInternalCall();
340 return -1;
341 }
342
343 /* Resizing unicode_empty and single character objects is not
344 possible since these are being shared. We simply return a fresh
345 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000346 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000347 (v == unicode_empty || v->length == 1)) {
348 PyUnicodeObject *w = _PyUnicode_New(length);
349 if (w == NULL)
350 return -1;
351 Py_UNICODE_COPY(w->str, v->str,
352 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000353 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000354 *unicode = (PyObject *)w;
355 return 0;
356 }
357
358 /* Note that we don't have to modify *unicode for unshared Unicode
359 objects, since we can modify them in-place. */
360 return unicode_resize(v, length);
361}
362
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper that casts the address of a PyUnicodeObject*
   variable to the PyObject** expected by PyUnicode_Resize().  Both
   arguments are parenthesized so that expressions expand safely. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
366
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000368 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369{
370 PyUnicodeObject *unicode;
371
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000372 /* If the Unicode data is known at construction time, we can apply
373 some optimizations which share commonly used objects. */
374 if (u != NULL) {
375
376 /* Optimization for empty strings */
377 if (size == 0 && unicode_empty != NULL) {
378 Py_INCREF(unicode_empty);
379 return (PyObject *)unicode_empty;
380 }
381
382 /* Single character Unicode objects in the Latin-1 range are
383 shared when using this constructor */
384 if (size == 1 && *u < 256) {
385 unicode = unicode_latin1[*u];
386 if (!unicode) {
387 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000388 if (!unicode)
389 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000390 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391 unicode_latin1[*u] = unicode;
392 }
393 Py_INCREF(unicode);
394 return (PyObject *)unicode;
395 }
396 }
Tim Petersced69f82003-09-16 20:30:58 +0000397
Guido van Rossumd57fd912000-03-10 22:53:23 +0000398 unicode = _PyUnicode_New(size);
399 if (!unicode)
400 return NULL;
401
402 /* Copy the Unicode data into the new object */
403 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 return (PyObject *)unicode;
407}
408
409#ifdef HAVE_WCHAR_H
410
411PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000412 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413{
414 PyUnicodeObject *unicode;
415
416 if (w == NULL) {
417 PyErr_BadInternalCall();
418 return NULL;
419 }
420
421 unicode = _PyUnicode_New(size);
422 if (!unicode)
423 return NULL;
424
425 /* Copy the wchar_t data into the new object */
426#ifdef HAVE_USABLE_WCHAR_T
427 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000428#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429 {
430 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000431 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000433 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000434 *u++ = *w++;
435 }
436#endif
437
438 return (PyObject *)unicode;
439}
440
Martin v. Löwis18e16552006-02-15 17:27:45 +0000441Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
442 wchar_t *w,
443 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444{
445 if (unicode == NULL) {
446 PyErr_BadInternalCall();
447 return -1;
448 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000449
450 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000452 size = PyUnicode_GET_SIZE(unicode) + 1;
453
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454#ifdef HAVE_USABLE_WCHAR_T
455 memcpy(w, unicode->str, size * sizeof(wchar_t));
456#else
457 {
458 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000459 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000461 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462 *w++ = *u++;
463 }
464#endif
465
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000466 if (size > PyUnicode_GET_SIZE(unicode))
467 return PyUnicode_GET_SIZE(unicode);
468 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 return size;
470}
471
472#endif
473
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000474PyObject *PyUnicode_FromOrdinal(int ordinal)
475{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000476 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000477
478#ifdef Py_UNICODE_WIDE
479 if (ordinal < 0 || ordinal > 0x10ffff) {
480 PyErr_SetString(PyExc_ValueError,
481 "unichr() arg not in range(0x110000) "
482 "(wide Python build)");
483 return NULL;
484 }
485#else
486 if (ordinal < 0 || ordinal > 0xffff) {
487 PyErr_SetString(PyExc_ValueError,
488 "unichr() arg not in range(0x10000) "
489 "(narrow Python build)");
490 return NULL;
491 }
492#endif
493
Hye-Shik Chang40574832004-04-06 07:24:51 +0000494 s[0] = (Py_UNICODE)ordinal;
495 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000496}
497
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498PyObject *PyUnicode_FromObject(register PyObject *obj)
499{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000500 /* XXX Perhaps we should make this API an alias of
501 PyObject_Unicode() instead ?! */
502 if (PyUnicode_CheckExact(obj)) {
503 Py_INCREF(obj);
504 return obj;
505 }
506 if (PyUnicode_Check(obj)) {
507 /* For a Unicode subtype that's not a Unicode object,
508 return a true Unicode object with the same data. */
509 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
510 PyUnicode_GET_SIZE(obj));
511 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
513}
514
515PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
516 const char *encoding,
517 const char *errors)
518{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000519 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000520 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000521 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000522
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523 if (obj == NULL) {
524 PyErr_BadInternalCall();
525 return NULL;
526 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000527
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000528#if 0
529 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000530 that no encodings is given and then redirect to
531 PyObject_Unicode() which then applies the additional logic for
532 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000533
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000534 NOTE: This API should really only be used for object which
535 represent *encoded* Unicode !
536
537 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000538 if (PyUnicode_Check(obj)) {
539 if (encoding) {
540 PyErr_SetString(PyExc_TypeError,
541 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000542 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000543 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000544 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000545 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000546#else
547 if (PyUnicode_Check(obj)) {
548 PyErr_SetString(PyExc_TypeError,
549 "decoding Unicode is not supported");
550 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000551 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000552#endif
553
554 /* Coerce object */
555 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000556 s = PyString_AS_STRING(obj);
557 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000559 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
560 /* Overwrite the error message with something more useful in
561 case of a TypeError. */
562 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000563 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000564 "coercing to Unicode: need string or buffer, "
565 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000566 obj->ob_type->tp_name);
567 goto onError;
568 }
Tim Petersced69f82003-09-16 20:30:58 +0000569
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000570 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 if (len == 0) {
572 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000573 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000574 }
Tim Petersced69f82003-09-16 20:30:58 +0000575 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000576 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000577
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000578 return v;
579
580 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582}
583
584PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000585 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 const char *encoding,
587 const char *errors)
588{
589 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000590
591 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000592 encoding = PyUnicode_GetDefaultEncoding();
593
594 /* Shortcuts for common default encodings */
595 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000597 else if (strcmp(encoding, "latin-1") == 0)
598 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000599#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
600 else if (strcmp(encoding, "mbcs") == 0)
601 return PyUnicode_DecodeMBCS(s, size, errors);
602#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000603 else if (strcmp(encoding, "ascii") == 0)
604 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605
606 /* Decode via the codec registry */
607 buffer = PyBuffer_FromMemory((void *)s, size);
608 if (buffer == NULL)
609 goto onError;
610 unicode = PyCodec_Decode(buffer, encoding, errors);
611 if (unicode == NULL)
612 goto onError;
613 if (!PyUnicode_Check(unicode)) {
614 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000615 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000616 unicode->ob_type->tp_name);
617 Py_DECREF(unicode);
618 goto onError;
619 }
620 Py_DECREF(buffer);
621 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000622
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623 onError:
624 Py_XDECREF(buffer);
625 return NULL;
626}
627
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000628PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
629 const char *encoding,
630 const char *errors)
631{
632 PyObject *v;
633
634 if (!PyUnicode_Check(unicode)) {
635 PyErr_BadArgument();
636 goto onError;
637 }
638
639 if (encoding == NULL)
640 encoding = PyUnicode_GetDefaultEncoding();
641
642 /* Decode via the codec registry */
643 v = PyCodec_Decode(unicode, encoding, errors);
644 if (v == NULL)
645 goto onError;
646 return v;
647
648 onError:
649 return NULL;
650}
651
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000653 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 const char *encoding,
655 const char *errors)
656{
657 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000658
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 unicode = PyUnicode_FromUnicode(s, size);
660 if (unicode == NULL)
661 return NULL;
662 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
663 Py_DECREF(unicode);
664 return v;
665}
666
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000667PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
668 const char *encoding,
669 const char *errors)
670{
671 PyObject *v;
672
673 if (!PyUnicode_Check(unicode)) {
674 PyErr_BadArgument();
675 goto onError;
676 }
677
678 if (encoding == NULL)
679 encoding = PyUnicode_GetDefaultEncoding();
680
681 /* Encode via the codec registry */
682 v = PyCodec_Encode(unicode, encoding, errors);
683 if (v == NULL)
684 goto onError;
685 return v;
686
687 onError:
688 return NULL;
689}
690
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
692 const char *encoding,
693 const char *errors)
694{
695 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000696
Guido van Rossumd57fd912000-03-10 22:53:23 +0000697 if (!PyUnicode_Check(unicode)) {
698 PyErr_BadArgument();
699 goto onError;
700 }
Fred Drakee4315f52000-05-09 19:53:39 +0000701
Tim Petersced69f82003-09-16 20:30:58 +0000702 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000703 encoding = PyUnicode_GetDefaultEncoding();
704
705 /* Shortcuts for common default encodings */
706 if (errors == NULL) {
707 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000708 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000709 else if (strcmp(encoding, "latin-1") == 0)
710 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000711#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
712 else if (strcmp(encoding, "mbcs") == 0)
713 return PyUnicode_AsMBCSString(unicode);
714#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000715 else if (strcmp(encoding, "ascii") == 0)
716 return PyUnicode_AsASCIIString(unicode);
717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718
719 /* Encode via the codec registry */
720 v = PyCodec_Encode(unicode, encoding, errors);
721 if (v == NULL)
722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 if (!PyString_Check(v)) {
724 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000725 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 v->ob_type->tp_name);
727 Py_DECREF(v);
728 goto onError;
729 }
730 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000731
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 onError:
733 return NULL;
734}
735
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000736PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
737 const char *errors)
738{
739 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
740
741 if (v)
742 return v;
743 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
744 if (v && errors == NULL)
745 ((PyUnicodeObject *)unicode)->defenc = v;
746 return v;
747}
748
Guido van Rossumd57fd912000-03-10 22:53:23 +0000749Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
750{
751 if (!PyUnicode_Check(unicode)) {
752 PyErr_BadArgument();
753 goto onError;
754 }
755 return PyUnicode_AS_UNICODE(unicode);
756
757 onError:
758 return NULL;
759}
760
Martin v. Löwis18e16552006-02-15 17:27:45 +0000761Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762{
763 if (!PyUnicode_Check(unicode)) {
764 PyErr_BadArgument();
765 goto onError;
766 }
767 return PyUnicode_GET_SIZE(unicode);
768
769 onError:
770 return -1;
771}
772
Thomas Wouters78890102000-07-22 19:25:51 +0000773const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000774{
775 return unicode_default_encoding;
776}
777
778int PyUnicode_SetDefaultEncoding(const char *encoding)
779{
780 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000781
Fred Drakee4315f52000-05-09 19:53:39 +0000782 /* Make sure the encoding is valid. As side effect, this also
783 loads the encoding into the codec registry cache. */
784 v = _PyCodec_Lookup(encoding);
785 if (v == NULL)
786 goto onError;
787 Py_DECREF(v);
788 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000789 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000790 sizeof(unicode_default_encoding));
791 return 0;
792
793 onError:
794 return -1;
795}
796
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000797/* error handling callback helper:
798 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000799 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800 and adjust various state variables.
801 return 0 on success, -1 on error
802*/
803
804static
805int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
806 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000807 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
808 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811
812 PyObject *restuple = NULL;
813 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
815 Py_ssize_t requiredsize;
816 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000817 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000818 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000819 int res = -1;
820
821 if (*errorHandler == NULL) {
822 *errorHandler = PyCodec_LookupError(errors);
823 if (*errorHandler == NULL)
824 goto onError;
825 }
826
827 if (*exceptionObject == NULL) {
828 *exceptionObject = PyUnicodeDecodeError_Create(
829 encoding, input, insize, *startinpos, *endinpos, reason);
830 if (*exceptionObject == NULL)
831 goto onError;
832 }
833 else {
834 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
835 goto onError;
836 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
837 goto onError;
838 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
839 goto onError;
840 }
841
842 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
843 if (restuple == NULL)
844 goto onError;
845 if (!PyTuple_Check(restuple)) {
846 PyErr_Format(PyExc_TypeError, &argparse[4]);
847 goto onError;
848 }
849 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
850 goto onError;
851 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000852 newpos = insize+newpos;
853 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000854 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000855 goto onError;
856 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857
858 /* need more space? (at least enough for what we
859 have+the replacement+the rest of the string (starting
860 at the new input position), so we won't have to check space
861 when there are no errors in the rest of the string) */
862 repptr = PyUnicode_AS_UNICODE(repunicode);
863 repsize = PyUnicode_GET_SIZE(repunicode);
864 requiredsize = *outpos + repsize + insize-newpos;
865 if (requiredsize > outsize) {
866 if (requiredsize<2*outsize)
867 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000868 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 goto onError;
870 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
871 }
872 *endinpos = newpos;
873 *inptr = input + newpos;
874 Py_UNICODE_COPY(*outptr, repptr, repsize);
875 *outptr += repsize;
876 *outpos += repsize;
877 /* we made it! */
878 res = 0;
879
880 onError:
881 Py_XDECREF(restuple);
882 return res;
883}
884
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000885/* --- UTF-7 Codec -------------------------------------------------------- */
886
887/* see RFC2152 for details */
888
/* Classification of the 128 ASCII code points for the UTF-7 codec:
   whether a character is special, i.e. cannot be directly encoded.
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   Each row below covers 16 consecutive code points. */
static
char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,     /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x10 - 0x1f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,     /* 0x20 - 0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,     /* 0x30 - 0x3f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,     /* 0x50 - 0x5f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,     /* 0x70 - 0x7f */
};
907
/* Note: SPECIAL tests (c) <= 0 instead of (c) == 0 to work around gcc
   warnings about a comparison that is always false; utf7_special[0] is 1,
   so treating that one value as special here is safe. */

/* True when c must be base64-encoded: outside ASCII, marked special in
   utf7_special, or in one of the optional classes selected by the flags. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is a character of the base64 alphabet. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* Inverse of B64: map a base64 alphabet character to its 6-bit value. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out for as long as at least 6 bits are
   buffered in ch; bits is reduced by 6 for every character written. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *out++ = B64((ch) >> ((bits)-6)); \
        (bits) -= 6; \
    }

/* Emit 16-bit units to out for as long as at least 16 bits are buffered in
   ch.  Relies on the expanding function providing an `errmsg` variable and
   a `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate */ \
            /* so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent */ \
            /* it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953 const char *errors)
954{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000955 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000956 Py_ssize_t startinpos;
957 Py_ssize_t endinpos;
958 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 const char *e;
960 PyUnicodeObject *unicode;
961 Py_UNICODE *p;
962 const char *errmsg = "";
963 int inShift = 0;
964 unsigned int bitsleft = 0;
965 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966 int surrogate = 0;
967 PyObject *errorHandler = NULL;
968 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969
970 unicode = _PyUnicode_New(size);
971 if (!unicode)
972 return NULL;
973 if (size == 0)
974 return (PyObject *)unicode;
975
976 p = unicode->str;
977 e = s + size;
978
979 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000980 Py_UNICODE ch;
981 restart:
Antoine Pitrouc8e4bed2008-07-25 19:00:48 +0000982 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000983
984 if (inShift) {
985 if ((ch == '-') || !B64CHAR(ch)) {
986 inShift = 0;
987 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000988
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000989 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
990 if (bitsleft >= 6) {
991 /* The shift sequence has a partial character in it. If
992 bitsleft < 6 then we could just classify it as padding
993 but that is not the case here */
994
995 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000999 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001000 here so indicate the potential of a misencoded character. */
1001
1002 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1003 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1004 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001005 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 }
1007
1008 if (ch == '-') {
1009 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001010 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 inShift = 1;
1012 }
1013 } else if (SPECIAL(ch,0,0)) {
1014 errmsg = "unexpected special character";
Guido van Rossumc261e482009-03-05 21:47:33 +00001015 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001016 } else {
1017 *p++ = ch;
1018 }
1019 } else {
1020 charsleft = (charsleft << 6) | UB64(ch);
1021 bitsleft += 6;
1022 s++;
1023 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1024 }
1025 }
1026 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001027 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001028 s++;
1029 if (s < e && *s == '-') {
1030 s++;
1031 *p++ = '+';
1032 } else
1033 {
1034 inShift = 1;
1035 bitsleft = 0;
1036 }
1037 }
1038 else if (SPECIAL(ch,0,0)) {
Guido van Rossumc261e482009-03-05 21:47:33 +00001039 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001040 errmsg = "unexpected special character";
1041 s++;
Guido van Rossumc261e482009-03-05 21:47:33 +00001042 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001043 }
1044 else {
1045 *p++ = ch;
1046 s++;
1047 }
1048 continue;
1049 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001050 outpos = p-PyUnicode_AS_UNICODE(unicode);
1051 endinpos = s-starts;
1052 if (unicode_decode_call_errorhandler(
1053 errors, &errorHandler,
1054 "utf7", errmsg,
1055 starts, size, &startinpos, &endinpos, &exc, &s,
1056 (PyObject **)&unicode, &outpos, &p))
1057 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 }
1059
1060 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001061 outpos = p-PyUnicode_AS_UNICODE(unicode);
1062 endinpos = size;
1063 if (unicode_decode_call_errorhandler(
1064 errors, &errorHandler,
1065 "utf7", "unterminated shift sequence",
1066 starts, size, &startinpos, &endinpos, &exc, &s,
1067 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001069 if (s < e)
1070 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001071 }
1072
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001073 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001074 goto onError;
1075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001076 Py_XDECREF(errorHandler);
1077 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001078 return (PyObject *)unicode;
1079
1080onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001081 Py_XDECREF(errorHandler);
1082 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001083 Py_DECREF(unicode);
1084 return NULL;
1085}
1086
1087
1088PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001089 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001090 int encodeSetO,
1091 int encodeWhiteSpace,
1092 const char *errors)
1093{
1094 PyObject *v;
1095 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001096 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001097 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001098 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001099 unsigned int bitsleft = 0;
1100 unsigned long charsleft = 0;
1101 char * out;
1102 char * start;
1103
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00001104 if (cbAllocated / 5 != size)
1105 return PyErr_NoMemory();
1106
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001107 if (size == 0)
1108 return PyString_FromStringAndSize(NULL, 0);
1109
1110 v = PyString_FromStringAndSize(NULL, cbAllocated);
1111 if (v == NULL)
1112 return NULL;
1113
1114 start = out = PyString_AS_STRING(v);
1115 for (;i < size; ++i) {
1116 Py_UNICODE ch = s[i];
1117
1118 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001119 if (ch == '+') {
1120 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001121 *out++ = '-';
1122 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1123 charsleft = ch;
1124 bitsleft = 16;
1125 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001126 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001127 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001128 } else {
1129 *out++ = (char) ch;
1130 }
1131 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001132 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1133 *out++ = B64(charsleft << (6-bitsleft));
1134 charsleft = 0;
1135 bitsleft = 0;
1136 /* Characters not in the BASE64 set implicitly unshift the sequence
1137 so no '-' is required, except if the character is itself a '-' */
1138 if (B64CHAR(ch) || ch == '-') {
1139 *out++ = '-';
1140 }
1141 inShift = 0;
1142 *out++ = (char) ch;
1143 } else {
1144 bitsleft += 16;
1145 charsleft = (charsleft << 16) | ch;
1146 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1147
1148 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001149 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001150 or '-' then the shift sequence will be terminated implicitly and we
1151 don't have to insert a '-'. */
1152
1153 if (bitsleft == 0) {
1154 if (i + 1 < size) {
1155 Py_UNICODE ch2 = s[i+1];
1156
1157 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001158
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001159 } else if (B64CHAR(ch2) || ch2 == '-') {
1160 *out++ = '-';
1161 inShift = 0;
1162 } else {
1163 inShift = 0;
1164 }
1165
1166 }
1167 else {
1168 *out++ = '-';
1169 inShift = 0;
1170 }
1171 }
Tim Petersced69f82003-09-16 20:30:58 +00001172 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001175 if (bitsleft) {
1176 *out++= B64(charsleft << (6-bitsleft) );
1177 *out++ = '-';
1178 }
1179
Tim Peters5de98422002-04-27 18:44:32 +00001180 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001181 return v;
1182}
1183
1184#undef SPECIAL
1185#undef B64
1186#undef B64CHAR
1187#undef UB64
1188#undef ENCODE
1189#undef DECODE
1190
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191/* --- UTF-8 Codec -------------------------------------------------------- */
1192
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero means illegal prefix.  See RFC 2279 for details.
   Each row below covers 16 consecutive byte values. */
static
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x20 - 0x2f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x30 - 0x3f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x40 - 0x4f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x50 - 0x5f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x60 - 0x6f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x80 - 0x8f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x90 - 0x9f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xa0 - 0xaf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xb0 - 0xbf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xc0 - 0xcf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xd0 - 0xdf */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     /* 0xe0 - 0xef */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0      /* 0xf0 - 0xff */
};
1214
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 const char *errors)
1218{
Walter Dörwald69652032004-09-07 20:24:22 +00001219 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1220}
1221
1222PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001223 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001224 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001225 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001229 Py_ssize_t startinpos;
1230 Py_ssize_t endinpos;
1231 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 const char *e;
1233 PyUnicodeObject *unicode;
1234 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 PyObject *errorHandler = NULL;
1237 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238
1239 /* Note: size will always be longer than the resulting Unicode
1240 character count */
1241 unicode = _PyUnicode_New(size);
1242 if (!unicode)
1243 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001244 if (size == 0) {
1245 if (consumed)
1246 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249
1250 /* Unpack UTF-8 encoded data */
1251 p = unicode->str;
1252 e = s + size;
1253
1254 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001255 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256
1257 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001258 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 s++;
1260 continue;
1261 }
1262
1263 n = utf8_code_length[ch];
1264
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001265 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001266 if (consumed)
1267 break;
1268 else {
1269 errmsg = "unexpected end of data";
1270 startinpos = s-starts;
1271 endinpos = size;
1272 goto utf8Error;
1273 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 switch (n) {
1277
1278 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283
1284 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
1290 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001291 if ((s[1] & 0xc0) != 0x80) {
1292 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 startinpos = s-starts;
1294 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001295 goto utf8Error;
1296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001298 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 errmsg = "illegal encoding";
1302 goto utf8Error;
1303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001305 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 break;
1307
1308 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001309 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001310 (s[2] & 0xc0) != 0x80) {
1311 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001312 startinpos = s-starts;
1313 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 goto utf8Error;
1315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001317 if (ch < 0x0800) {
1318 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001319 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001320
1321 XXX For wide builds (UCS-4) we should probably try
1322 to recombine the surrogates into a single code
1323 unit.
1324 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001331 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001332 break;
1333
1334 case 4:
1335 if ((s[1] & 0xc0) != 0x80 ||
1336 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001337 (s[3] & 0xc0) != 0x80) {
1338 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001343 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1344 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1345 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001346 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001347 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001348 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001349 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001350 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001351 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001352 startinpos = s-starts;
1353 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001354 goto utf8Error;
1355 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001356#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001357 *p++ = (Py_UNICODE)ch;
1358#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001360
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001361 /* translate from 10000..10FFFF to 0..FFFF */
1362 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001363
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001364 /* high surrogate = top 10 bits added to D800 */
1365 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001366
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001367 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001368 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001369#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 break;
1371
1372 default:
1373 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 startinpos = s-starts;
1376 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001377 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378 }
1379 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001380 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001381
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001382 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 outpos = p-PyUnicode_AS_UNICODE(unicode);
1384 if (unicode_decode_call_errorhandler(
1385 errors, &errorHandler,
1386 "utf8", errmsg,
1387 starts, size, &startinpos, &endinpos, &exc, &s,
1388 (PyObject **)&unicode, &outpos, &p))
1389 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
Walter Dörwald69652032004-09-07 20:24:22 +00001391 if (consumed)
1392 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393
1394 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 goto onError;
1397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001398 Py_XDECREF(errorHandler);
1399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 return (PyObject *)unicode;
1401
1402onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 Py_XDECREF(errorHandler);
1404 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405 Py_DECREF(unicode);
1406 return NULL;
1407}
1408
Tim Peters602f7402002-04-27 18:03:26 +00001409/* Allocation strategy: if the string is short, convert into a stack buffer
1410 and allocate exactly as much space needed at the end. Else allocate the
1411 maximum possible needed (4 result bytes per Unicode character), and return
1412 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001413*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001414PyObject *
1415PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418{
Tim Peters602f7402002-04-27 18:03:26 +00001419#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001420
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001422 PyObject *v; /* result string object */
1423 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001424 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001425 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001426 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001427
Tim Peters602f7402002-04-27 18:03:26 +00001428 assert(s != NULL);
1429 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430
Tim Peters602f7402002-04-27 18:03:26 +00001431 if (size <= MAX_SHORT_UNICHARS) {
1432 /* Write into the stack buffer; nallocated can't overflow.
1433 * At the end, we'll allocate exactly as much heap space as it
1434 * turns out we need.
1435 */
1436 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1437 v = NULL; /* will allocate after we're done */
1438 p = stackbuf;
1439 }
1440 else {
1441 /* Overallocate on the heap, and give the excess back at the end. */
1442 nallocated = size * 4;
1443 if (nallocated / 4 != size) /* overflow! */
1444 return PyErr_NoMemory();
1445 v = PyString_FromStringAndSize(NULL, nallocated);
1446 if (v == NULL)
1447 return NULL;
1448 p = PyString_AS_STRING(v);
1449 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001450
Tim Peters602f7402002-04-27 18:03:26 +00001451 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001452 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001453
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001454 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001455 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001457
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001459 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001460 *p++ = (char)(0xc0 | (ch >> 6));
1461 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001463 else {
Tim Peters602f7402002-04-27 18:03:26 +00001464 /* Encode UCS2 Unicode ordinals */
1465 if (ch < 0x10000) {
1466 /* Special case: check for high surrogate */
1467 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1468 Py_UCS4 ch2 = s[i];
1469 /* Check for low surrogate and combine the two to
1470 form a UCS4 value */
1471 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001472 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001473 i++;
1474 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001475 }
Tim Peters602f7402002-04-27 18:03:26 +00001476 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001477 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001478 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 continue;
1482 }
1483encodeUCS4:
1484 /* Encode UCS4 Unicode ordinals */
1485 *p++ = (char)(0xf0 | (ch >> 18));
1486 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1487 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1488 *p++ = (char)(0x80 | (ch & 0x3f));
1489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001491
Tim Peters602f7402002-04-27 18:03:26 +00001492 if (v == NULL) {
1493 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001494 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001495 assert(nneeded <= nallocated);
1496 v = PyString_FromStringAndSize(stackbuf, nneeded);
1497 }
1498 else {
1499 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001500 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001501 assert(nneeded <= nallocated);
1502 _PyString_Resize(&v, nneeded);
1503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001505
Tim Peters602f7402002-04-27 18:03:26 +00001506#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1510{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 if (!PyUnicode_Check(unicode)) {
1512 PyErr_BadArgument();
1513 return NULL;
1514 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001515 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518}
1519
1520/* --- UTF-16 Codec ------------------------------------------------------- */
1521
Tim Peters772747b2001-08-09 22:21:55 +00001522PyObject *
1523PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001524 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001525 const char *errors,
1526 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527{
Walter Dörwald69652032004-09-07 20:24:22 +00001528 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1529}
1530
1531PyObject *
1532PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001533 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001534 const char *errors,
1535 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001536 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001539 Py_ssize_t startinpos;
1540 Py_ssize_t endinpos;
1541 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 PyUnicodeObject *unicode;
1543 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001544 const unsigned char *q, *e;
1545 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001546 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001547 /* Offsets from q for retrieving byte pairs in the right order. */
1548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1549 int ihi = 1, ilo = 0;
1550#else
1551 int ihi = 0, ilo = 1;
1552#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001553 PyObject *errorHandler = NULL;
1554 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 /* Note: size will always be longer than the resulting Unicode
1557 character count */
1558 unicode = _PyUnicode_New(size);
1559 if (!unicode)
1560 return NULL;
1561 if (size == 0)
1562 return (PyObject *)unicode;
1563
1564 /* Unpack UTF-16 encoded data */
1565 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001566 q = (unsigned char *)s;
1567 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568
1569 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001570 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001572 /* Check for BOM marks (U+FEFF) in the input and adjust current
1573 byte order setting accordingly. In native mode, the leading BOM
1574 mark is skipped, in all other modes, it is copied to the output
1575 stream as-is (giving a ZWNBSP character). */
1576 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001577 if (size >= 2) {
1578 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001579#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001580 if (bom == 0xFEFF) {
1581 q += 2;
1582 bo = -1;
1583 }
1584 else if (bom == 0xFFFE) {
1585 q += 2;
1586 bo = 1;
1587 }
Tim Petersced69f82003-09-16 20:30:58 +00001588#else
Walter Dörwald69652032004-09-07 20:24:22 +00001589 if (bom == 0xFEFF) {
1590 q += 2;
1591 bo = 1;
1592 }
1593 else if (bom == 0xFFFE) {
1594 q += 2;
1595 bo = -1;
1596 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001597#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001598 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600
Tim Peters772747b2001-08-09 22:21:55 +00001601 if (bo == -1) {
1602 /* force LE */
1603 ihi = 1;
1604 ilo = 0;
1605 }
1606 else if (bo == 1) {
1607 /* force BE */
1608 ihi = 0;
1609 ilo = 1;
1610 }
1611
1612 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001614 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001616 if (consumed)
1617 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 errmsg = "truncated data";
1619 startinpos = ((const char *)q)-starts;
1620 endinpos = ((const char *)e)-starts;
1621 goto utf16Error;
1622 /* The remaining input chars are ignored if the callback
1623 chooses to skip the input */
1624 }
1625 ch = (q[ihi] << 8) | q[ilo];
1626
Tim Peters772747b2001-08-09 22:21:55 +00001627 q += 2;
1628
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 if (ch < 0xD800 || ch > 0xDFFF) {
1630 *p++ = ch;
1631 continue;
1632 }
1633
1634 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001635 if (q >= e) {
1636 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 startinpos = (((const char *)q)-2)-starts;
1638 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001639 goto utf16Error;
1640 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001641 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001642 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1643 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001644 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001645#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001646 *p++ = ch;
1647 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001648#else
1649 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001650#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001651 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001652 }
1653 else {
1654 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655 startinpos = (((const char *)q)-4)-starts;
1656 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001657 goto utf16Error;
1658 }
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001661 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662 startinpos = (((const char *)q)-2)-starts;
1663 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001664 /* Fall through to report the error */
1665
1666 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001667 outpos = p-PyUnicode_AS_UNICODE(unicode);
1668 if (unicode_decode_call_errorhandler(
1669 errors, &errorHandler,
1670 "utf16", errmsg,
1671 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1672 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 }
1675
1676 if (byteorder)
1677 *byteorder = bo;
1678
Walter Dörwald69652032004-09-07 20:24:22 +00001679 if (consumed)
1680 *consumed = (const char *)q-starts;
1681
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001683 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 goto onError;
1685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001686 Py_XDECREF(errorHandler);
1687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 return (PyObject *)unicode;
1689
1690onError:
1691 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001692 Py_XDECREF(errorHandler);
1693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 return NULL;
1695}
1696
Tim Peters772747b2001-08-09 22:21:55 +00001697PyObject *
1698PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001700 const char *errors,
1701 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702{
1703 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001704 unsigned char *p;
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00001705 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001706#ifdef Py_UNICODE_WIDE
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00001707 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001708#else
1709 const int pairs = 0;
1710#endif
Tim Peters772747b2001-08-09 22:21:55 +00001711 /* Offsets from p for storing byte pairs in the right order. */
1712#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1713 int ihi = 1, ilo = 0;
1714#else
1715 int ihi = 0, ilo = 1;
1716#endif
1717
1718#define STORECHAR(CH) \
1719 do { \
1720 p[ihi] = ((CH) >> 8) & 0xff; \
1721 p[ilo] = (CH) & 0xff; \
1722 p += 2; \
1723 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001725#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001726 for (i = pairs = 0; i < size; i++)
1727 if (s[i] >= 0x10000)
1728 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001729#endif
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00001730 /* 2 * (size + pairs + (byteorder == 0)) */
1731 if (size > PY_SSIZE_T_MAX ||
1732 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
1733 return PyErr_NoMemory();
1734 nsize = (size + pairs + (byteorder == 0));
1735 bytesize = nsize * 2;
1736 if (bytesize / 2 != nsize)
1737 return PyErr_NoMemory();
1738 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 if (v == NULL)
1740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
Tim Peters772747b2001-08-09 22:21:55 +00001742 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001744 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001745 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001746 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001747
1748 if (byteorder == -1) {
1749 /* force LE */
1750 ihi = 1;
1751 ilo = 0;
1752 }
1753 else if (byteorder == 1) {
1754 /* force BE */
1755 ihi = 0;
1756 ilo = 1;
1757 }
1758
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001759 while (size-- > 0) {
1760 Py_UNICODE ch = *s++;
1761 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001762#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001763 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001764 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1765 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001767#endif
Tim Peters772747b2001-08-09 22:21:55 +00001768 STORECHAR(ch);
1769 if (ch2)
1770 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001773#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774}
1775
1776PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1777{
1778 if (!PyUnicode_Check(unicode)) {
1779 PyErr_BadArgument();
1780 return NULL;
1781 }
1782 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1783 PyUnicode_GET_SIZE(unicode),
1784 NULL,
1785 0);
1786}
1787
1788/* --- Unicode Escape Codec ----------------------------------------------- */
1789
Fredrik Lundh06d12682001-01-24 07:59:11 +00001790static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001791
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001793 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 const char *errors)
1795{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001796 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001797 Py_ssize_t startinpos;
1798 Py_ssize_t endinpos;
1799 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001800 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001802 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001804 char* message;
1805 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 PyObject *errorHandler = NULL;
1807 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001808
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 /* Escaped strings will always be longer than the resulting
1810 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 length after conversion to the true value.
1812 (but if the error callback returns a long replacement string
1813 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 v = _PyUnicode_New(size);
1815 if (v == NULL)
1816 goto onError;
1817 if (size == 0)
1818 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001819
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 while (s < end) {
1824 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001825 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827
1828 /* Non-escape characters are interpreted as Unicode ordinals */
1829 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001830 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 continue;
1832 }
1833
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001834 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 /* \ - Escapes */
1836 s++;
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001837 c = *s++;
1838 if (s > end)
1839 c = '\0'; /* Invalid after \ */
1840 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841
1842 /* \x escapes */
1843 case '\n': break;
1844 case '\\': *p++ = '\\'; break;
1845 case '\'': *p++ = '\''; break;
1846 case '\"': *p++ = '\"'; break;
1847 case 'b': *p++ = '\b'; break;
1848 case 'f': *p++ = '\014'; break; /* FF */
1849 case 't': *p++ = '\t'; break;
1850 case 'n': *p++ = '\n'; break;
1851 case 'r': *p++ = '\r'; break;
1852 case 'v': *p++ = '\013'; break; /* VT */
1853 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1854
1855 /* \OOO (octal) escapes */
1856 case '0': case '1': case '2': case '3':
1857 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001858 x = s[-1] - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001859 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001860 x = (x<<3) + *s++ - '0';
Georg Brandl1dcb9c92007-11-02 22:46:38 +00001861 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001862 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001864 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 break;
1866
Fredrik Lundhccc74732001-02-18 22:13:49 +00001867 /* hex escapes */
1868 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001870 digits = 2;
1871 message = "truncated \\xXX escape";
1872 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001876 digits = 4;
1877 message = "truncated \\uXXXX escape";
1878 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879
Fredrik Lundhccc74732001-02-18 22:13:49 +00001880 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001881 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001882 digits = 8;
1883 message = "truncated \\UXXXXXXXX escape";
1884 hexescape:
1885 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886 outpos = p-PyUnicode_AS_UNICODE(v);
1887 if (s+digits>end) {
1888 endinpos = size;
1889 if (unicode_decode_call_errorhandler(
1890 errors, &errorHandler,
1891 "unicodeescape", "end of string in escape sequence",
1892 starts, size, &startinpos, &endinpos, &exc, &s,
1893 (PyObject **)&v, &outpos, &p))
1894 goto onError;
1895 goto nextByte;
1896 }
1897 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 endinpos = (s+i+1)-starts;
1901 if (unicode_decode_call_errorhandler(
1902 errors, &errorHandler,
1903 "unicodeescape", message,
1904 starts, size, &startinpos, &endinpos, &exc, &s,
1905 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001906 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001908 }
1909 chr = (chr<<4) & ~0xF;
1910 if (c >= '0' && c <= '9')
1911 chr += c - '0';
1912 else if (c >= 'a' && c <= 'f')
1913 chr += 10 + c - 'a';
1914 else
1915 chr += 10 + c - 'A';
1916 }
1917 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001918 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 /* _decoding_error will have already written into the
1920 target buffer. */
1921 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001923 /* when we get here, chr is a 32-bit unicode character */
1924 if (chr <= 0xffff)
1925 /* UCS-2 character */
1926 *p++ = (Py_UNICODE) chr;
1927 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001928 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001929 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001930#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001931 *p++ = chr;
1932#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001933 chr -= 0x10000L;
1934 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001935 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001936#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001937 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 endinpos = s-starts;
1939 outpos = p-PyUnicode_AS_UNICODE(v);
1940 if (unicode_decode_call_errorhandler(
1941 errors, &errorHandler,
1942 "unicodeescape", "illegal Unicode character",
1943 starts, size, &startinpos, &endinpos, &exc, &s,
1944 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001945 goto onError;
1946 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001947 break;
1948
1949 /* \N{name} */
1950 case 'N':
1951 message = "malformed \\N character escape";
1952 if (ucnhash_CAPI == NULL) {
1953 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001954 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001955 m = PyImport_ImportModule("unicodedata");
1956 if (m == NULL)
1957 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001958 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001959 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001960 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001961 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001962 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001963 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 if (ucnhash_CAPI == NULL)
1965 goto ucnhashError;
1966 }
1967 if (*s == '{') {
1968 const char *start = s+1;
1969 /* look for the closing brace */
1970 while (*s != '}' && s < end)
1971 s++;
1972 if (s > start && s < end && *s == '}') {
1973 /* found a name. look it up in the unicode database */
1974 message = "unknown Unicode character name";
1975 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001976 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001977 goto store;
1978 }
1979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001980 endinpos = s-starts;
1981 outpos = p-PyUnicode_AS_UNICODE(v);
1982 if (unicode_decode_call_errorhandler(
1983 errors, &errorHandler,
1984 "unicodeescape", message,
1985 starts, size, &startinpos, &endinpos, &exc, &s,
1986 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001987 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001988 break;
1989
1990 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001991 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001992 message = "\\ at end of string";
1993 s--;
1994 endinpos = s-starts;
1995 outpos = p-PyUnicode_AS_UNICODE(v);
1996 if (unicode_decode_call_errorhandler(
1997 errors, &errorHandler,
1998 "unicodeescape", message,
1999 starts, size, &startinpos, &endinpos, &exc, &s,
2000 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002001 goto onError;
2002 }
2003 else {
2004 *p++ = '\\';
2005 *p++ = (unsigned char)s[-1];
2006 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002007 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002009 nextByte:
2010 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002012 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002014 Py_XDECREF(errorHandler);
2015 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002017
Fredrik Lundhccc74732001-02-18 22:13:49 +00002018ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002019 PyErr_SetString(
2020 PyExc_UnicodeError,
2021 "\\N escapes not supported (can't load unicodedata module)"
2022 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002023 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002024 Py_XDECREF(errorHandler);
2025 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002026 return NULL;
2027
Fredrik Lundhccc74732001-02-18 22:13:49 +00002028onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 Py_XDECREF(errorHandler);
2031 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 return NULL;
2033}
2034
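/* unicodeescape_string() below returns a Unicode-Escape string version
   of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.  findchar() is the helper it uses to look for quote
   characters in the input when choosing between the two.

*/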
2041
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002042Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002043 Py_ssize_t size,
2044 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002045{
2046 /* like wcschr, but doesn't stop at NULL characters */
2047
2048 while (size-- > 0) {
2049 if (*s == ch)
2050 return s;
2051 s++;
2052 }
2053
2054 return NULL;
2055}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002056
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057static
2058PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002059 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 int quotes)
2061{
2062 PyObject *repr;
2063 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002065 static const char *hexdigit = "0123456789abcdef";
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002066#ifdef Py_UNICODE_WIDE
2067 const Py_ssize_t expandsize = 10;
2068#else
2069 const Py_ssize_t expandsize = 6;
2070#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002072 /* Initial allocation is based on the longest-possible unichr
2073 escape.
2074
2075 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2076 unichr, so in this case it's the longest unichr escape. In
2077 narrow (UTF-16) builds this is five chars per source unichr
2078 since there are two unichrs in the surrogate pair, so in narrow
2079 (UTF-16) builds it's not the longest unichr escape.
2080
2081 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2082 so in the narrow (UTF-16) build case it's the longest unichr
2083 escape.
2084 */
2085
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002086 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2087 return PyErr_NoMemory();
2088
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002089 repr = PyString_FromStringAndSize(NULL,
2090 2
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002091 + expandsize*size
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002092 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 if (repr == NULL)
2094 return NULL;
2095
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002096 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097
2098 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002100 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 !findchar(s, size, '"')) ? '"' : '\'';
2102 }
2103 while (size-- > 0) {
2104 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002105
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002106 /* Escape quotes and backslashes */
2107 if ((quotes &&
2108 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 *p++ = '\\';
2110 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002111 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002112 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002113
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002114#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002115 /* Map 21-bit characters to '\U00xxxxxx' */
2116 else if (ch >= 0x10000) {
2117 *p++ = '\\';
2118 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002119 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2120 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2121 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2122 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2123 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2124 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2125 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002126 *p++ = hexdigit[ch & 0x0000000F];
2127 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002128 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002129#else
2130 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002131 else if (ch >= 0xD800 && ch < 0xDC00) {
2132 Py_UNICODE ch2;
2133 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002134
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002135 ch2 = *s++;
2136 size--;
2137 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2138 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2139 *p++ = '\\';
2140 *p++ = 'U';
2141 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2142 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2143 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2144 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2145 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2146 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2147 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2148 *p++ = hexdigit[ucs & 0x0000000F];
2149 continue;
2150 }
2151 /* Fall through: isolated surrogates are copied as-is */
2152 s--;
2153 size++;
2154 }
Neal Norwitz19c35bb2006-08-21 22:13:11 +00002155#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002156
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002158 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 *p++ = '\\';
2160 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002161 *p++ = hexdigit[(ch >> 12) & 0x000F];
2162 *p++ = hexdigit[(ch >> 8) & 0x000F];
2163 *p++ = hexdigit[(ch >> 4) & 0x000F];
2164 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002166
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002167 /* Map special whitespace to '\t', \n', '\r' */
2168 else if (ch == '\t') {
2169 *p++ = '\\';
2170 *p++ = 't';
2171 }
2172 else if (ch == '\n') {
2173 *p++ = '\\';
2174 *p++ = 'n';
2175 }
2176 else if (ch == '\r') {
2177 *p++ = '\\';
2178 *p++ = 'r';
2179 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002180
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002181 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002182 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002184 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002185 *p++ = hexdigit[(ch >> 4) & 0x000F];
2186 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002187 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002188
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 /* Copy everything else as-is */
2190 else
2191 *p++ = (char) ch;
2192 }
2193 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002194 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195
2196 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002197 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 return repr;
2199}
2200
2201PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002202 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203{
2204 return unicodeescape_string(s, size, 0);
2205}
2206
2207PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2208{
2209 if (!PyUnicode_Check(unicode)) {
2210 PyErr_BadArgument();
2211 return NULL;
2212 }
2213 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2214 PyUnicode_GET_SIZE(unicode));
2215}
2216
2217/* --- Raw Unicode Escape Codec ------------------------------------------- */
2218
2219PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002220 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 const char *errors)
2222{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002223 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002224 Py_ssize_t startinpos;
2225 Py_ssize_t endinpos;
2226 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002228 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 const char *end;
2230 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 PyObject *errorHandler = NULL;
2232 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002233
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 /* Escaped strings will always be longer than the resulting
2235 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002236 length after conversion to the true value. (But decoding error
2237 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 v = _PyUnicode_New(size);
2239 if (v == NULL)
2240 goto onError;
2241 if (size == 0)
2242 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002243 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 end = s + size;
2245 while (s < end) {
2246 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002247 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002249 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250
2251 /* Non-escape characters are interpreted as Unicode ordinals */
2252 if (*s != '\\') {
2253 *p++ = (unsigned char)*s++;
2254 continue;
2255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257
2258 /* \u-escapes are only interpreted iff the number of leading
2259 backslashes if odd */
2260 bs = s;
2261 for (;s < end;) {
2262 if (*s != '\\')
2263 break;
2264 *p++ = (unsigned char)*s++;
2265 }
2266 if (((s - bs) & 1) == 0 ||
2267 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002268 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 continue;
2270 }
2271 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002272 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 s++;
2274
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002275 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002277 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002280 endinpos = s-starts;
2281 if (unicode_decode_call_errorhandler(
2282 errors, &errorHandler,
2283 "rawunicodeescape", "truncated \\uXXXX",
2284 starts, size, &startinpos, &endinpos, &exc, &s,
2285 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002287 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 }
2289 x = (x<<4) & ~0xF;
2290 if (c >= '0' && c <= '9')
2291 x += c - '0';
2292 else if (c >= 'a' && c <= 'f')
2293 x += 10 + c - 'a';
2294 else
2295 x += 10 + c - 'A';
2296 }
Amaury Forgeot d'Arca79e0502008-03-24 21:16:28 +00002297 if (x <= 0xffff)
2298 /* UCS-2 character */
2299 *p++ = (Py_UNICODE) x;
2300 else if (x <= 0x10ffff) {
2301 /* UCS-4 character. Either store directly, or as
2302 surrogate pair. */
2303#ifdef Py_UNICODE_WIDE
2304 *p++ = (Py_UNICODE) x;
2305#else
2306 x -= 0x10000L;
2307 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
2308 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
2309#endif
2310 } else {
2311 endinpos = s-starts;
2312 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002313 if (unicode_decode_call_errorhandler(
2314 errors, &errorHandler,
2315 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2316 starts, size, &startinpos, &endinpos, &exc, &s,
2317 (PyObject **)&v, &outpos, &p))
2318 goto onError;
2319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002320 nextByte:
2321 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002323 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002324 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002325 Py_XDECREF(errorHandler);
2326 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002328
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329 onError:
2330 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002331 Py_XDECREF(errorHandler);
2332 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333 return NULL;
2334}
2335
2336PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002337 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338{
2339 PyObject *repr;
2340 char *p;
2341 char *q;
2342
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002343 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002344#ifdef Py_UNICODE_WIDE
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002345 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002346#else
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002347 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002348#endif
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00002349
2350 if (size > PY_SSIZE_T_MAX / expandsize)
2351 return PyErr_NoMemory();
2352
2353 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 if (repr == NULL)
2355 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002356 if (size == 0)
2357 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358
2359 p = q = PyString_AS_STRING(repr);
2360 while (size-- > 0) {
2361 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002362#ifdef Py_UNICODE_WIDE
2363 /* Map 32-bit characters to '\Uxxxxxxxx' */
2364 if (ch >= 0x10000) {
2365 *p++ = '\\';
2366 *p++ = 'U';
2367 *p++ = hexdigit[(ch >> 28) & 0xf];
2368 *p++ = hexdigit[(ch >> 24) & 0xf];
2369 *p++ = hexdigit[(ch >> 20) & 0xf];
2370 *p++ = hexdigit[(ch >> 16) & 0xf];
2371 *p++ = hexdigit[(ch >> 12) & 0xf];
2372 *p++ = hexdigit[(ch >> 8) & 0xf];
2373 *p++ = hexdigit[(ch >> 4) & 0xf];
2374 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002375 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002376 else
Amaury Forgeot d'Arca79e0502008-03-24 21:16:28 +00002377#else
2378 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2379 if (ch >= 0xD800 && ch < 0xDC00) {
2380 Py_UNICODE ch2;
2381 Py_UCS4 ucs;
2382
2383 ch2 = *s++;
2384 size--;
2385 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2386 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2387 *p++ = '\\';
2388 *p++ = 'U';
2389 *p++ = hexdigit[(ucs >> 28) & 0xf];
2390 *p++ = hexdigit[(ucs >> 24) & 0xf];
2391 *p++ = hexdigit[(ucs >> 20) & 0xf];
2392 *p++ = hexdigit[(ucs >> 16) & 0xf];
2393 *p++ = hexdigit[(ucs >> 12) & 0xf];
2394 *p++ = hexdigit[(ucs >> 8) & 0xf];
2395 *p++ = hexdigit[(ucs >> 4) & 0xf];
2396 *p++ = hexdigit[ucs & 0xf];
2397 continue;
2398 }
2399 /* Fall through: isolated surrogates are copied as-is */
2400 s--;
2401 size++;
2402 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002403#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404 /* Map 16-bit characters to '\uxxxx' */
2405 if (ch >= 256) {
2406 *p++ = '\\';
2407 *p++ = 'u';
2408 *p++ = hexdigit[(ch >> 12) & 0xf];
2409 *p++ = hexdigit[(ch >> 8) & 0xf];
2410 *p++ = hexdigit[(ch >> 4) & 0xf];
2411 *p++ = hexdigit[ch & 15];
2412 }
2413 /* Copy everything else as-is */
2414 else
2415 *p++ = (char) ch;
2416 }
2417 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002418 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419 return repr;
2420}
2421
2422PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2423{
2424 if (!PyUnicode_Check(unicode)) {
2425 PyErr_BadArgument();
2426 return NULL;
2427 }
2428 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2429 PyUnicode_GET_SIZE(unicode));
2430}
2431
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002432/* --- Unicode Internal Codec ------------------------------------------- */
2433
2434PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002435 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002436 const char *errors)
2437{
2438 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002439 Py_ssize_t startinpos;
2440 Py_ssize_t endinpos;
2441 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002442 PyUnicodeObject *v;
2443 Py_UNICODE *p;
2444 const char *end;
2445 const char *reason;
2446 PyObject *errorHandler = NULL;
2447 PyObject *exc = NULL;
2448
Neal Norwitzd43069c2006-01-08 01:12:10 +00002449#ifdef Py_UNICODE_WIDE
2450 Py_UNICODE unimax = PyUnicode_GetMax();
2451#endif
2452
Armin Rigo4b63c212006-10-04 11:44:06 +00002453 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002454 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2455 if (v == NULL)
2456 goto onError;
2457 if (PyUnicode_GetSize((PyObject *)v) == 0)
2458 return (PyObject *)v;
2459 p = PyUnicode_AS_UNICODE(v);
2460 end = s + size;
2461
2462 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002463 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002464 /* We have to sanity check the raw data, otherwise doom looms for
2465 some malformed UCS-4 data. */
2466 if (
2467 #ifdef Py_UNICODE_WIDE
2468 *p > unimax || *p < 0 ||
2469 #endif
2470 end-s < Py_UNICODE_SIZE
2471 )
2472 {
2473 startinpos = s - starts;
2474 if (end-s < Py_UNICODE_SIZE) {
2475 endinpos = end-starts;
2476 reason = "truncated input";
2477 }
2478 else {
2479 endinpos = s - starts + Py_UNICODE_SIZE;
2480 reason = "illegal code point (> 0x10FFFF)";
2481 }
2482 outpos = p - PyUnicode_AS_UNICODE(v);
2483 if (unicode_decode_call_errorhandler(
2484 errors, &errorHandler,
2485 "unicode_internal", reason,
2486 starts, size, &startinpos, &endinpos, &exc, &s,
2487 (PyObject **)&v, &outpos, &p)) {
2488 goto onError;
2489 }
2490 }
2491 else {
2492 p++;
2493 s += Py_UNICODE_SIZE;
2494 }
2495 }
2496
Martin v. Löwis412fb672006-04-13 06:34:32 +00002497 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002498 goto onError;
2499 Py_XDECREF(errorHandler);
2500 Py_XDECREF(exc);
2501 return (PyObject *)v;
2502
2503 onError:
2504 Py_XDECREF(v);
2505 Py_XDECREF(errorHandler);
2506 Py_XDECREF(exc);
2507 return NULL;
2508}
2509
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510/* --- Latin-1 Codec ------------------------------------------------------ */
2511
2512PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002513 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514 const char *errors)
2515{
2516 PyUnicodeObject *v;
2517 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002518
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002520 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002521 Py_UNICODE r = *(unsigned char*)s;
2522 return PyUnicode_FromUnicode(&r, 1);
2523 }
2524
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 v = _PyUnicode_New(size);
2526 if (v == NULL)
2527 goto onError;
2528 if (size == 0)
2529 return (PyObject *)v;
2530 p = PyUnicode_AS_UNICODE(v);
2531 while (size-- > 0)
2532 *p++ = (unsigned char)*s++;
2533 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002534
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 onError:
2536 Py_XDECREF(v);
2537 return NULL;
2538}
2539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002540/* create or adjust a UnicodeEncodeError */
2541static void make_encode_exception(PyObject **exceptionObject,
2542 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002543 const Py_UNICODE *unicode, Py_ssize_t size,
2544 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002545 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002547 if (*exceptionObject == NULL) {
2548 *exceptionObject = PyUnicodeEncodeError_Create(
2549 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 }
2551 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002552 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2553 goto onError;
2554 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2555 goto onError;
2556 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2557 goto onError;
2558 return;
2559 onError:
2560 Py_DECREF(*exceptionObject);
2561 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 }
2563}
2564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002565/* raises a UnicodeEncodeError */
2566static void raise_encode_exception(PyObject **exceptionObject,
2567 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002568 const Py_UNICODE *unicode, Py_ssize_t size,
2569 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 const char *reason)
2571{
2572 make_encode_exception(exceptionObject,
2573 encoding, unicode, size, startpos, endpos, reason);
2574 if (*exceptionObject != NULL)
2575 PyCodec_StrictErrors(*exceptionObject);
2576}
2577
2578/* error handling callback helper:
2579 build arguments, call the callback and check the arguments,
2580 put the result into newpos and return the replacement string, which
2581 has to be freed by the caller */
2582static PyObject *unicode_encode_call_errorhandler(const char *errors,
2583 PyObject **errorHandler,
2584 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002585 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2586 Py_ssize_t startpos, Py_ssize_t endpos,
2587 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002589 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590
2591 PyObject *restuple;
2592 PyObject *resunicode;
2593
2594 if (*errorHandler == NULL) {
2595 *errorHandler = PyCodec_LookupError(errors);
2596 if (*errorHandler == NULL)
2597 return NULL;
2598 }
2599
2600 make_encode_exception(exceptionObject,
2601 encoding, unicode, size, startpos, endpos, reason);
2602 if (*exceptionObject == NULL)
2603 return NULL;
2604
2605 restuple = PyObject_CallFunctionObjArgs(
2606 *errorHandler, *exceptionObject, NULL);
2607 if (restuple == NULL)
2608 return NULL;
2609 if (!PyTuple_Check(restuple)) {
2610 PyErr_Format(PyExc_TypeError, &argparse[4]);
2611 Py_DECREF(restuple);
2612 return NULL;
2613 }
2614 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2615 &resunicode, newpos)) {
2616 Py_DECREF(restuple);
2617 return NULL;
2618 }
2619 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002620 *newpos = size+*newpos;
2621 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002622 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002623 Py_DECREF(restuple);
2624 return NULL;
2625 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002626 Py_INCREF(resunicode);
2627 Py_DECREF(restuple);
2628 return resunicode;
2629}
2630
2631static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002632 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 const char *errors,
2634 int limit)
2635{
2636 /* output object */
2637 PyObject *res;
2638 /* pointers to the beginning and end+1 of input */
2639 const Py_UNICODE *startp = p;
2640 const Py_UNICODE *endp = p + size;
2641 /* pointer to the beginning of the unencodable characters */
2642 /* const Py_UNICODE *badp = NULL; */
2643 /* pointer into the output */
2644 char *str;
2645 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002646 Py_ssize_t respos = 0;
2647 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002648 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2649 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002650 PyObject *errorHandler = NULL;
2651 PyObject *exc = NULL;
2652 /* the following variable is used for caching string comparisons
2653 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2654 int known_errorHandler = -1;
2655
2656 /* allocate enough for a simple encoding without
2657 replacements, if we need more, we'll resize */
2658 res = PyString_FromStringAndSize(NULL, size);
2659 if (res == NULL)
2660 goto onError;
2661 if (size == 0)
2662 return res;
2663 str = PyString_AS_STRING(res);
2664 ressize = size;
2665
2666 while (p<endp) {
2667 Py_UNICODE c = *p;
2668
2669 /* can we encode this? */
2670 if (c<limit) {
2671 /* no overflow check, because we know that the space is enough */
2672 *str++ = (char)c;
2673 ++p;
2674 }
2675 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002676 Py_ssize_t unicodepos = p-startp;
2677 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002678 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002679 Py_ssize_t repsize;
2680 Py_ssize_t newpos;
2681 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 Py_UNICODE *uni2;
2683 /* startpos for collecting unencodable chars */
2684 const Py_UNICODE *collstart = p;
2685 const Py_UNICODE *collend = p;
2686 /* find all unecodable characters */
2687 while ((collend < endp) && ((*collend)>=limit))
2688 ++collend;
2689 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2690 if (known_errorHandler==-1) {
2691 if ((errors==NULL) || (!strcmp(errors, "strict")))
2692 known_errorHandler = 1;
2693 else if (!strcmp(errors, "replace"))
2694 known_errorHandler = 2;
2695 else if (!strcmp(errors, "ignore"))
2696 known_errorHandler = 3;
2697 else if (!strcmp(errors, "xmlcharrefreplace"))
2698 known_errorHandler = 4;
2699 else
2700 known_errorHandler = 0;
2701 }
2702 switch (known_errorHandler) {
2703 case 1: /* strict */
2704 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2705 goto onError;
2706 case 2: /* replace */
2707 while (collstart++<collend)
2708 *str++ = '?'; /* fall through */
2709 case 3: /* ignore */
2710 p = collend;
2711 break;
2712 case 4: /* xmlcharrefreplace */
2713 respos = str-PyString_AS_STRING(res);
2714 /* determine replacement size (temporarily (mis)uses p) */
2715 for (p = collstart, repsize = 0; p < collend; ++p) {
2716 if (*p<10)
2717 repsize += 2+1+1;
2718 else if (*p<100)
2719 repsize += 2+2+1;
2720 else if (*p<1000)
2721 repsize += 2+3+1;
2722 else if (*p<10000)
2723 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002724#ifndef Py_UNICODE_WIDE
2725 else
2726 repsize += 2+5+1;
2727#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 else if (*p<100000)
2729 repsize += 2+5+1;
2730 else if (*p<1000000)
2731 repsize += 2+6+1;
2732 else
2733 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002734#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 }
2736 requiredsize = respos+repsize+(endp-collend);
2737 if (requiredsize > ressize) {
2738 if (requiredsize<2*ressize)
2739 requiredsize = 2*ressize;
2740 if (_PyString_Resize(&res, requiredsize))
2741 goto onError;
2742 str = PyString_AS_STRING(res) + respos;
2743 ressize = requiredsize;
2744 }
2745 /* generate replacement (temporarily (mis)uses p) */
2746 for (p = collstart; p < collend; ++p) {
2747 str += sprintf(str, "&#%d;", (int)*p);
2748 }
2749 p = collend;
2750 break;
2751 default:
2752 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2753 encoding, reason, startp, size, &exc,
2754 collstart-startp, collend-startp, &newpos);
2755 if (repunicode == NULL)
2756 goto onError;
2757 /* need more space? (at least enough for what we
2758 have+the replacement+the rest of the string, so
2759 we won't have to check space for encodable characters) */
2760 respos = str-PyString_AS_STRING(res);
2761 repsize = PyUnicode_GET_SIZE(repunicode);
2762 requiredsize = respos+repsize+(endp-collend);
2763 if (requiredsize > ressize) {
2764 if (requiredsize<2*ressize)
2765 requiredsize = 2*ressize;
2766 if (_PyString_Resize(&res, requiredsize)) {
2767 Py_DECREF(repunicode);
2768 goto onError;
2769 }
2770 str = PyString_AS_STRING(res) + respos;
2771 ressize = requiredsize;
2772 }
2773 /* check if there is anything unencodable in the replacement
2774 and copy it to the output */
2775 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2776 c = *uni2;
2777 if (c >= limit) {
2778 raise_encode_exception(&exc, encoding, startp, size,
2779 unicodepos, unicodepos+1, reason);
2780 Py_DECREF(repunicode);
2781 goto onError;
2782 }
2783 *str = (char)c;
2784 }
2785 p = startp + newpos;
2786 Py_DECREF(repunicode);
2787 }
2788 }
2789 }
2790 /* Resize if we allocated to much */
2791 respos = str-PyString_AS_STRING(res);
2792 if (respos<ressize)
2793 /* If this falls res will be NULL */
2794 _PyString_Resize(&res, respos);
2795 Py_XDECREF(errorHandler);
2796 Py_XDECREF(exc);
2797 return res;
2798
2799 onError:
2800 Py_XDECREF(res);
2801 Py_XDECREF(errorHandler);
2802 Py_XDECREF(exc);
2803 return NULL;
2804}
2805
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002807 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 const char *errors)
2809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811}
2812
2813PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2814{
2815 if (!PyUnicode_Check(unicode)) {
2816 PyErr_BadArgument();
2817 return NULL;
2818 }
2819 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2820 PyUnicode_GET_SIZE(unicode),
2821 NULL);
2822}
2823
2824/* --- 7-bit ASCII Codec -------------------------------------------------- */
2825
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002827 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 const char *errors)
2829{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 PyUnicodeObject *v;
2832 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002833 Py_ssize_t startinpos;
2834 Py_ssize_t endinpos;
2835 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836 const char *e;
2837 PyObject *errorHandler = NULL;
2838 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002839
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002841 if (size == 1 && *(unsigned char*)s < 128) {
2842 Py_UNICODE r = *(unsigned char*)s;
2843 return PyUnicode_FromUnicode(&r, 1);
2844 }
Tim Petersced69f82003-09-16 20:30:58 +00002845
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 v = _PyUnicode_New(size);
2847 if (v == NULL)
2848 goto onError;
2849 if (size == 0)
2850 return (PyObject *)v;
2851 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852 e = s + size;
2853 while (s < e) {
2854 register unsigned char c = (unsigned char)*s;
2855 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 ++s;
2858 }
2859 else {
2860 startinpos = s-starts;
2861 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002862 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 if (unicode_decode_call_errorhandler(
2864 errors, &errorHandler,
2865 "ascii", "ordinal not in range(128)",
2866 starts, size, &startinpos, &endinpos, &exc, &s,
2867 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002871 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002872 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002873 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002874 Py_XDECREF(errorHandler);
2875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002877
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878 onError:
2879 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002880 Py_XDECREF(errorHandler);
2881 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 return NULL;
2883}
2884
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002886 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 const char *errors)
2888{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002889 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890}
2891
2892PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2893{
2894 if (!PyUnicode_Check(unicode)) {
2895 PyErr_BadArgument();
2896 return NULL;
2897 }
2898 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2899 PyUnicode_GET_SIZE(unicode),
2900 NULL);
2901}
2902
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002903#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002904
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002905/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002906
Martin v. Löwisd8251432006-06-14 05:21:04 +00002907#if SIZEOF_INT < SIZEOF_SSIZE_T
2908#define NEED_RETRY
2909#endif
2910
2911/* XXX This code is limited to "true" double-byte encodings, as
2912 a) it assumes an incomplete character consists of a single byte, and
2913 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2914 encodings, see IsDBCSLeadByteEx documentation. */
2915
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts counts
   when CharPrev() cannot step back from it, when the previous
   character does not start with a lead byte, or when the previous
   character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2926
2927/*
2928 * Decode MBCS string into unicode object. If 'final' is set, converts
2929 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2930 */
2931static int decode_mbcs(PyUnicodeObject **v,
2932 const char *s, /* MBCS string */
2933 int size, /* sizeof MBCS string */
2934 int final)
2935{
2936 Py_UNICODE *p;
2937 Py_ssize_t n = 0;
2938 int usize = 0;
2939
2940 assert(size >= 0);
2941
2942 /* Skip trailing lead-byte unless 'final' is set */
2943 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2944 --size;
2945
2946 /* First get the size of the result */
2947 if (size > 0) {
2948 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2949 if (usize == 0) {
2950 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2951 return -1;
2952 }
2953 }
2954
2955 if (*v == NULL) {
2956 /* Create unicode object */
2957 *v = _PyUnicode_New(usize);
2958 if (*v == NULL)
2959 return -1;
2960 }
2961 else {
2962 /* Extend unicode object */
2963 n = PyUnicode_GET_SIZE(*v);
2964 if (_PyUnicode_Resize(v, n + usize) < 0)
2965 return -1;
2966 }
2967
2968 /* Do the conversion */
2969 if (size > 0) {
2970 p = PyUnicode_AS_UNICODE(*v) + n;
2971 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2972 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2973 return -1;
2974 }
2975 }
2976
2977 return size;
2978}
2979
2980PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2981 Py_ssize_t size,
2982 const char *errors,
2983 Py_ssize_t *consumed)
2984{
2985 PyUnicodeObject *v = NULL;
2986 int done;
2987
2988 if (consumed)
2989 *consumed = 0;
2990
2991#ifdef NEED_RETRY
2992 retry:
2993 if (size > INT_MAX)
2994 done = decode_mbcs(&v, s, INT_MAX, 0);
2995 else
2996#endif
2997 done = decode_mbcs(&v, s, (int)size, !consumed);
2998
2999 if (done < 0) {
3000 Py_XDECREF(v);
3001 return NULL;
3002 }
3003
3004 if (consumed)
3005 *consumed += done;
3006
3007#ifdef NEED_RETRY
3008 if (size > INT_MAX) {
3009 s += done;
3010 size -= done;
3011 goto retry;
3012 }
3013#endif
3014
3015 return (PyObject *)v;
3016}
3017
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003018PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003019 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003020 const char *errors)
3021{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003022 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3023}
3024
3025/*
3026 * Convert unicode into string object (MBCS).
3027 * Returns 0 if succeed, -1 otherwise.
3028 */
3029static int encode_mbcs(PyObject **repr,
3030 const Py_UNICODE *p, /* unicode */
3031 int size) /* size of unicode */
3032{
3033 int mbcssize = 0;
3034 Py_ssize_t n = 0;
3035
3036 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003037
3038 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003039 if (size > 0) {
3040 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3041 if (mbcssize == 0) {
3042 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3043 return -1;
3044 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003045 }
3046
Martin v. Löwisd8251432006-06-14 05:21:04 +00003047 if (*repr == NULL) {
3048 /* Create string object */
3049 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3050 if (*repr == NULL)
3051 return -1;
3052 }
3053 else {
3054 /* Extend string object */
3055 n = PyString_Size(*repr);
3056 if (_PyString_Resize(repr, n + mbcssize) < 0)
3057 return -1;
3058 }
3059
3060 /* Do the conversion */
3061 if (size > 0) {
3062 char *s = PyString_AS_STRING(*repr) + n;
3063 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3064 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3065 return -1;
3066 }
3067 }
3068
3069 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003070}
3071
3072PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003073 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003074 const char *errors)
3075{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003076 PyObject *repr = NULL;
3077 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003078
Martin v. Löwisd8251432006-06-14 05:21:04 +00003079#ifdef NEED_RETRY
3080 retry:
3081 if (size > INT_MAX)
3082 ret = encode_mbcs(&repr, p, INT_MAX);
3083 else
3084#endif
3085 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003086
Martin v. Löwisd8251432006-06-14 05:21:04 +00003087 if (ret < 0) {
3088 Py_XDECREF(repr);
3089 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003090 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003091
3092#ifdef NEED_RETRY
3093 if (size > INT_MAX) {
3094 p += INT_MAX;
3095 size -= INT_MAX;
3096 goto retry;
3097 }
3098#endif
3099
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003100 return repr;
3101}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003102
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003103PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3104{
3105 if (!PyUnicode_Check(unicode)) {
3106 PyErr_BadArgument();
3107 return NULL;
3108 }
3109 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3110 PyUnicode_GET_SIZE(unicode),
3111 NULL);
3112}
3113
Martin v. Löwisd8251432006-06-14 05:21:04 +00003114#undef NEED_RETRY
3115
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003116#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003117
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118/* --- Character Mapping Codec -------------------------------------------- */
3119
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003121 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 PyObject *mapping,
3123 const char *errors)
3124{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003125 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t startinpos;
3127 Py_ssize_t endinpos;
3128 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003129 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 PyUnicodeObject *v;
3131 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003132 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 PyObject *errorHandler = NULL;
3134 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003135 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003136 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003137
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 /* Default to Latin-1 */
3139 if (mapping == NULL)
3140 return PyUnicode_DecodeLatin1(s, size, errors);
3141
3142 v = _PyUnicode_New(size);
3143 if (v == NULL)
3144 goto onError;
3145 if (size == 0)
3146 return (PyObject *)v;
3147 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003148 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003149 if (PyUnicode_CheckExact(mapping)) {
3150 mapstring = PyUnicode_AS_UNICODE(mapping);
3151 maplen = PyUnicode_GET_SIZE(mapping);
3152 while (s < e) {
3153 unsigned char ch = *s;
3154 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003156 if (ch < maplen)
3157 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003159 if (x == 0xfffe) {
3160 /* undefined mapping */
3161 outpos = p-PyUnicode_AS_UNICODE(v);
3162 startinpos = s-starts;
3163 endinpos = startinpos+1;
3164 if (unicode_decode_call_errorhandler(
3165 errors, &errorHandler,
3166 "charmap", "character maps to <undefined>",
3167 starts, size, &startinpos, &endinpos, &exc, &s,
3168 (PyObject **)&v, &outpos, &p)) {
3169 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003170 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003171 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003172 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003173 *p++ = x;
3174 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003176 }
3177 else {
3178 while (s < e) {
3179 unsigned char ch = *s;
3180 PyObject *w, *x;
3181
3182 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3183 w = PyInt_FromLong((long)ch);
3184 if (w == NULL)
3185 goto onError;
3186 x = PyObject_GetItem(mapping, w);
3187 Py_DECREF(w);
3188 if (x == NULL) {
3189 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3190 /* No mapping found means: mapping is undefined. */
3191 PyErr_Clear();
3192 x = Py_None;
3193 Py_INCREF(x);
3194 } else
3195 goto onError;
3196 }
3197
3198 /* Apply mapping */
3199 if (PyInt_Check(x)) {
3200 long value = PyInt_AS_LONG(x);
3201 if (value < 0 || value > 65535) {
3202 PyErr_SetString(PyExc_TypeError,
3203 "character mapping must be in range(65536)");
3204 Py_DECREF(x);
3205 goto onError;
3206 }
3207 *p++ = (Py_UNICODE)value;
3208 }
3209 else if (x == Py_None) {
3210 /* undefined mapping */
3211 outpos = p-PyUnicode_AS_UNICODE(v);
3212 startinpos = s-starts;
3213 endinpos = startinpos+1;
3214 if (unicode_decode_call_errorhandler(
3215 errors, &errorHandler,
3216 "charmap", "character maps to <undefined>",
3217 starts, size, &startinpos, &endinpos, &exc, &s,
3218 (PyObject **)&v, &outpos, &p)) {
3219 Py_DECREF(x);
3220 goto onError;
3221 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003222 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003223 continue;
3224 }
3225 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003226 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003227
3228 if (targetsize == 1)
3229 /* 1-1 mapping */
3230 *p++ = *PyUnicode_AS_UNICODE(x);
3231
3232 else if (targetsize > 1) {
3233 /* 1-n mapping */
3234 if (targetsize > extrachars) {
3235 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003236 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3237 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003238 (targetsize << 2);
3239 extrachars += needed;
Armin Rigo4b63c212006-10-04 11:44:06 +00003240 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003241 if (_PyUnicode_Resize(&v,
3242 PyUnicode_GET_SIZE(v) + needed) < 0) {
3243 Py_DECREF(x);
3244 goto onError;
3245 }
3246 p = PyUnicode_AS_UNICODE(v) + oldpos;
3247 }
3248 Py_UNICODE_COPY(p,
3249 PyUnicode_AS_UNICODE(x),
3250 targetsize);
3251 p += targetsize;
3252 extrachars -= targetsize;
3253 }
3254 /* 1-0 mapping: skip the character */
3255 }
3256 else {
3257 /* wrong return value */
3258 PyErr_SetString(PyExc_TypeError,
3259 "character mapping must return integer, None or unicode");
3260 Py_DECREF(x);
3261 goto onError;
3262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003264 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 }
3267 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003268 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270 Py_XDECREF(errorHandler);
3271 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003273
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 Py_XDECREF(errorHandler);
3276 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 Py_XDECREF(v);
3278 return NULL;
3279}
3280
Martin v. Löwis3f767792006-06-04 19:36:28 +00003281/* Charmap encoding: the lookup table */
3282
3283struct encoding_map{
3284 PyObject_HEAD
3285 unsigned char level1[32];
3286 int count2, count3;
3287 unsigned char level23[1];
3288};
3289
3290static PyObject*
3291encoding_map_size(PyObject *obj, PyObject* args)
3292{
3293 struct encoding_map *map = (struct encoding_map*)obj;
3294 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3295 128*map->count3);
3296}
3297
3298static PyMethodDef encoding_map_methods[] = {
3299 {"size", encoding_map_size, METH_NOARGS,
3300 PyDoc_STR("Return the size (in bytes) of this object") },
3301 { 0 }
3302};
3303
3304static void
3305encoding_map_dealloc(PyObject* o)
3306{
3307 PyObject_FREE(o);
3308}
3309
3310static PyTypeObject EncodingMapType = {
3311 PyObject_HEAD_INIT(NULL)
3312 0, /*ob_size*/
3313 "EncodingMap", /*tp_name*/
3314 sizeof(struct encoding_map), /*tp_basicsize*/
3315 0, /*tp_itemsize*/
3316 /* methods */
3317 encoding_map_dealloc, /*tp_dealloc*/
3318 0, /*tp_print*/
3319 0, /*tp_getattr*/
3320 0, /*tp_setattr*/
3321 0, /*tp_compare*/
3322 0, /*tp_repr*/
3323 0, /*tp_as_number*/
3324 0, /*tp_as_sequence*/
3325 0, /*tp_as_mapping*/
3326 0, /*tp_hash*/
3327 0, /*tp_call*/
3328 0, /*tp_str*/
3329 0, /*tp_getattro*/
3330 0, /*tp_setattro*/
3331 0, /*tp_as_buffer*/
3332 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3333 0, /*tp_doc*/
3334 0, /*tp_traverse*/
3335 0, /*tp_clear*/
3336 0, /*tp_richcompare*/
3337 0, /*tp_weaklistoffset*/
3338 0, /*tp_iter*/
3339 0, /*tp_iternext*/
3340 encoding_map_methods, /*tp_methods*/
3341 0, /*tp_members*/
3342 0, /*tp_getset*/
3343 0, /*tp_base*/
3344 0, /*tp_dict*/
3345 0, /*tp_descr_get*/
3346 0, /*tp_descr_set*/
3347 0, /*tp_dictoffset*/
3348 0, /*tp_init*/
3349 0, /*tp_alloc*/
3350 0, /*tp_new*/
3351 0, /*tp_free*/
3352 0, /*tp_is_gc*/
3353};
3354
3355PyObject*
3356PyUnicode_BuildEncodingMap(PyObject* string)
3357{
3358 Py_UNICODE *decode;
3359 PyObject *result;
3360 struct encoding_map *mresult;
3361 int i;
3362 int need_dict = 0;
3363 unsigned char level1[32];
3364 unsigned char level2[512];
3365 unsigned char *mlevel1, *mlevel2, *mlevel3;
3366 int count2 = 0, count3 = 0;
3367
3368 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3369 PyErr_BadArgument();
3370 return NULL;
3371 }
3372 decode = PyUnicode_AS_UNICODE(string);
3373 memset(level1, 0xFF, sizeof level1);
3374 memset(level2, 0xFF, sizeof level2);
3375
3376 /* If there isn't a one-to-one mapping of NULL to \0,
3377 or if there are non-BMP characters, we need to use
3378 a mapping dictionary. */
3379 if (decode[0] != 0)
3380 need_dict = 1;
3381 for (i = 1; i < 256; i++) {
3382 int l1, l2;
3383 if (decode[i] == 0
3384 #ifdef Py_UNICODE_WIDE
3385 || decode[i] > 0xFFFF
3386 #endif
3387 ) {
3388 need_dict = 1;
3389 break;
3390 }
3391 if (decode[i] == 0xFFFE)
3392 /* unmapped character */
3393 continue;
3394 l1 = decode[i] >> 11;
3395 l2 = decode[i] >> 7;
3396 if (level1[l1] == 0xFF)
3397 level1[l1] = count2++;
3398 if (level2[l2] == 0xFF)
3399 level2[l2] = count3++;
3400 }
3401
3402 if (count2 >= 0xFF || count3 >= 0xFF)
3403 need_dict = 1;
3404
3405 if (need_dict) {
3406 PyObject *result = PyDict_New();
3407 PyObject *key, *value;
3408 if (!result)
3409 return NULL;
3410 for (i = 0; i < 256; i++) {
3411 key = value = NULL;
3412 key = PyInt_FromLong(decode[i]);
3413 value = PyInt_FromLong(i);
3414 if (!key || !value)
3415 goto failed1;
3416 if (PyDict_SetItem(result, key, value) == -1)
3417 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003418 Py_DECREF(key);
3419 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003420 }
3421 return result;
3422 failed1:
3423 Py_XDECREF(key);
3424 Py_XDECREF(value);
3425 Py_DECREF(result);
3426 return NULL;
3427 }
3428
3429 /* Create a three-level trie */
3430 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3431 16*count2 + 128*count3 - 1);
3432 if (!result)
3433 return PyErr_NoMemory();
3434 PyObject_Init(result, &EncodingMapType);
3435 mresult = (struct encoding_map*)result;
3436 mresult->count2 = count2;
3437 mresult->count3 = count3;
3438 mlevel1 = mresult->level1;
3439 mlevel2 = mresult->level23;
3440 mlevel3 = mresult->level23 + 16*count2;
3441 memcpy(mlevel1, level1, 32);
3442 memset(mlevel2, 0xFF, 16*count2);
3443 memset(mlevel3, 0, 128*count3);
3444 count3 = 0;
3445 for (i = 1; i < 256; i++) {
3446 int o1, o2, o3, i2, i3;
3447 if (decode[i] == 0xFFFE)
3448 /* unmapped character */
3449 continue;
3450 o1 = decode[i]>>11;
3451 o2 = (decode[i]>>7) & 0xF;
3452 i2 = 16*mlevel1[o1] + o2;
3453 if (mlevel2[i2] == 0xFF)
3454 mlevel2[i2] = count3++;
3455 o3 = decode[i] & 0x7F;
3456 i3 = 128*mlevel2[i2] + o3;
3457 mlevel3[i3] = i;
3458 }
3459 return result;
3460}
3461
3462static int
3463encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3464{
3465 struct encoding_map *map = (struct encoding_map*)mapping;
3466 int l1 = c>>11;
3467 int l2 = (c>>7) & 0xF;
3468 int l3 = c & 0x7F;
3469 int i;
3470
3471#ifdef Py_UNICODE_WIDE
3472 if (c > 0xFFFF) {
3473 return -1;
3474 }
3475#endif
3476 if (c == 0)
3477 return 0;
3478 /* level 1*/
3479 i = map->level1[l1];
3480 if (i == 0xFF) {
3481 return -1;
3482 }
3483 /* level 2*/
3484 i = map->level23[16*i+l2];
3485 if (i == 0xFF) {
3486 return -1;
3487 }
3488 /* level 3 */
3489 i = map->level23[16*map->count2 + 128*i + l3];
3490 if (i == 0) {
3491 return -1;
3492 }
3493 return i;
3494}
3495
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496/* Lookup the character ch in the mapping. If the character
3497 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003498 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 PyObject *w = PyInt_FromLong((long)c);
3502 PyObject *x;
3503
3504 if (w == NULL)
3505 return NULL;
3506 x = PyObject_GetItem(mapping, w);
3507 Py_DECREF(w);
3508 if (x == NULL) {
3509 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3510 /* No mapping found means: mapping is undefined. */
3511 PyErr_Clear();
3512 x = Py_None;
3513 Py_INCREF(x);
3514 return x;
3515 } else
3516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003518 else if (x == Py_None)
3519 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 else if (PyInt_Check(x)) {
3521 long value = PyInt_AS_LONG(x);
3522 if (value < 0 || value > 255) {
3523 PyErr_SetString(PyExc_TypeError,
3524 "character mapping must be in range(256)");
3525 Py_DECREF(x);
3526 return NULL;
3527 }
3528 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 else if (PyString_Check(x))
3531 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 /* wrong return value */
3534 PyErr_SetString(PyExc_TypeError,
3535 "character mapping must return integer, None or str");
3536 Py_DECREF(x);
3537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 }
3539}
3540
Martin v. Löwis3f767792006-06-04 19:36:28 +00003541static int
3542charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3543{
3544 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3545 /* exponentially overallocate to minimize reallocations */
3546 if (requiredsize < 2*outsize)
3547 requiredsize = 2*outsize;
3548 if (_PyString_Resize(outobj, requiredsize)) {
3549 return 0;
3550 }
3551 return 1;
3552}
3553
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557/* lookup the character, put the result in the output string and adjust
3558 various state variables. Reallocate the output string if not enough
3559 space is available. Return a new reference to the object that
3560 was put in the output buffer, or Py_None, if the mapping was undefined
3561 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003562 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003564charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003567 PyObject *rep;
3568 char *outstart;
3569 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570
Martin v. Löwis3f767792006-06-04 19:36:28 +00003571 if (mapping->ob_type == &EncodingMapType) {
3572 int res = encoding_map_lookup(c, mapping);
3573 Py_ssize_t requiredsize = *outpos+1;
3574 if (res == -1)
3575 return enc_FAILED;
3576 if (outsize<requiredsize)
3577 if (!charmapencode_resize(outobj, outpos, requiredsize))
3578 return enc_EXCEPTION;
3579 outstart = PyString_AS_STRING(*outobj);
3580 outstart[(*outpos)++] = (char)res;
3581 return enc_SUCCESS;
3582 }
3583
3584 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003586 return enc_EXCEPTION;
3587 else if (rep==Py_None) {
3588 Py_DECREF(rep);
3589 return enc_FAILED;
3590 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003592 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003593 if (outsize<requiredsize)
3594 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003596 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003598 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3600 }
3601 else {
3602 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003603 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3604 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003605 if (outsize<requiredsize)
3606 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003608 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003610 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 memcpy(outstart + *outpos, repchars, repsize);
3612 *outpos += repsize;
3613 }
3614 }
Georg Brandl9f167602006-06-04 21:46:16 +00003615 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003616 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617}
3618
3619/* handle an error in PyUnicode_EncodeCharmap
3620 Return 0 on success, -1 on error */
3621static
3622int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003623 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003625 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003626 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627{
3628 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003629 Py_ssize_t repsize;
3630 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 Py_UNICODE *uni2;
3632 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003633 Py_ssize_t collstartpos = *inpos;
3634 Py_ssize_t collendpos = *inpos+1;
3635 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 char *encoding = "charmap";
3637 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003638 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 /* find all unencodable characters */
3641 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003642 PyObject *rep;
3643 if (mapping->ob_type == &EncodingMapType) {
3644 int res = encoding_map_lookup(p[collendpos], mapping);
3645 if (res != -1)
3646 break;
3647 ++collendpos;
3648 continue;
3649 }
3650
3651 rep = charmapencode_lookup(p[collendpos], mapping);
3652 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003654 else if (rep!=Py_None) {
3655 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 break;
3657 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003658 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 ++collendpos;
3660 }
3661 /* cache callback name lookup
3662 * (if not done yet, i.e. it's the first error) */
3663 if (*known_errorHandler==-1) {
3664 if ((errors==NULL) || (!strcmp(errors, "strict")))
3665 *known_errorHandler = 1;
3666 else if (!strcmp(errors, "replace"))
3667 *known_errorHandler = 2;
3668 else if (!strcmp(errors, "ignore"))
3669 *known_errorHandler = 3;
3670 else if (!strcmp(errors, "xmlcharrefreplace"))
3671 *known_errorHandler = 4;
3672 else
3673 *known_errorHandler = 0;
3674 }
3675 switch (*known_errorHandler) {
3676 case 1: /* strict */
3677 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3678 return -1;
3679 case 2: /* replace */
3680 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3681 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003682 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 return -1;
3684 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003685 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3687 return -1;
3688 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 }
3690 /* fall through */
3691 case 3: /* ignore */
3692 *inpos = collendpos;
3693 break;
3694 case 4: /* xmlcharrefreplace */
3695 /* generate replacement (temporarily (mis)uses p) */
3696 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3697 char buffer[2+29+1+1];
3698 char *cp;
3699 sprintf(buffer, "&#%d;", (int)p[collpos]);
3700 for (cp = buffer; *cp; ++cp) {
3701 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003702 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003704 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3706 return -1;
3707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 }
3709 }
3710 *inpos = collendpos;
3711 break;
3712 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003713 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 encoding, reason, p, size, exceptionObject,
3715 collstartpos, collendpos, &newpos);
3716 if (repunicode == NULL)
3717 return -1;
3718 /* generate replacement */
3719 repsize = PyUnicode_GET_SIZE(repunicode);
3720 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3721 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003722 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 return -1;
3724 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003725 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3728 return -1;
3729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 }
3731 *inpos = newpos;
3732 Py_DECREF(repunicode);
3733 }
3734 return 0;
3735}
3736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003738 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 PyObject *mapping,
3740 const char *errors)
3741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 /* output object */
3743 PyObject *res = NULL;
3744 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003747 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 PyObject *errorHandler = NULL;
3749 PyObject *exc = NULL;
3750 /* the following variable is used for caching string comparisons
3751 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3752 * 3=ignore, 4=xmlcharrefreplace */
3753 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754
3755 /* Default to Latin-1 */
3756 if (mapping == NULL)
3757 return PyUnicode_EncodeLatin1(p, size, errors);
3758
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 /* allocate enough for a simple encoding without
3760 replacements, if we need more, we'll resize */
3761 res = PyString_FromStringAndSize(NULL, size);
3762 if (res == NULL)
3763 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003764 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 while (inpos<size) {
3768 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003769 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3770 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003772 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 if (charmap_encoding_error(p, size, &inpos, mapping,
3774 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003775 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003776 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003777 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 else
3781 /* done with this character => adjust input position */
3782 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 /* Resize if we allocated to much */
3786 if (respos<PyString_GET_SIZE(res)) {
3787 if (_PyString_Resize(&res, respos))
3788 goto onError;
3789 }
3790 Py_XDECREF(exc);
3791 Py_XDECREF(errorHandler);
3792 return res;
3793
3794 onError:
3795 Py_XDECREF(res);
3796 Py_XDECREF(exc);
3797 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 return NULL;
3799}
3800
3801PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3802 PyObject *mapping)
3803{
3804 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3805 PyErr_BadArgument();
3806 return NULL;
3807 }
3808 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3809 PyUnicode_GET_SIZE(unicode),
3810 mapping,
3811 NULL);
3812}
3813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814/* create or adjust a UnicodeTranslateError */
3815static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003816 const Py_UNICODE *unicode, Py_ssize_t size,
3817 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 if (*exceptionObject == NULL) {
3821 *exceptionObject = PyUnicodeTranslateError_Create(
3822 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 }
3824 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3826 goto onError;
3827 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3828 goto onError;
3829 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3830 goto onError;
3831 return;
3832 onError:
3833 Py_DECREF(*exceptionObject);
3834 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 }
3836}
3837
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838/* raises a UnicodeTranslateError */
3839static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003840 const Py_UNICODE *unicode, Py_ssize_t size,
3841 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842 const char *reason)
3843{
3844 make_translate_exception(exceptionObject,
3845 unicode, size, startpos, endpos, reason);
3846 if (*exceptionObject != NULL)
3847 PyCodec_StrictErrors(*exceptionObject);
3848}
3849
3850/* error handling callback helper:
3851 build arguments, call the callback and check the arguments,
3852 put the result into newpos and return the replacement string, which
3853 has to be freed by the caller */
3854static PyObject *unicode_translate_call_errorhandler(const char *errors,
3855 PyObject **errorHandler,
3856 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003857 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3858 Py_ssize_t startpos, Py_ssize_t endpos,
3859 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003861 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862
Martin v. Löwis412fb672006-04-13 06:34:32 +00003863 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 PyObject *restuple;
3865 PyObject *resunicode;
3866
3867 if (*errorHandler == NULL) {
3868 *errorHandler = PyCodec_LookupError(errors);
3869 if (*errorHandler == NULL)
3870 return NULL;
3871 }
3872
3873 make_translate_exception(exceptionObject,
3874 unicode, size, startpos, endpos, reason);
3875 if (*exceptionObject == NULL)
3876 return NULL;
3877
3878 restuple = PyObject_CallFunctionObjArgs(
3879 *errorHandler, *exceptionObject, NULL);
3880 if (restuple == NULL)
3881 return NULL;
3882 if (!PyTuple_Check(restuple)) {
3883 PyErr_Format(PyExc_TypeError, &argparse[4]);
3884 Py_DECREF(restuple);
3885 return NULL;
3886 }
3887 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003888 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003889 Py_DECREF(restuple);
3890 return NULL;
3891 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003892 if (i_newpos<0)
3893 *newpos = size+i_newpos;
3894 else
3895 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003896 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003897 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003898 Py_DECREF(restuple);
3899 return NULL;
3900 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 Py_INCREF(resunicode);
3902 Py_DECREF(restuple);
3903 return resunicode;
3904}
3905
3906/* Lookup the character ch in the mapping and put the result in result,
3907 which must be decrefed by the caller.
3908 Return 0 on success, -1 on error */
3909static
3910int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3911{
3912 PyObject *w = PyInt_FromLong((long)c);
3913 PyObject *x;
3914
3915 if (w == NULL)
3916 return -1;
3917 x = PyObject_GetItem(mapping, w);
3918 Py_DECREF(w);
3919 if (x == NULL) {
3920 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3921 /* No mapping found means: use 1:1 mapping. */
3922 PyErr_Clear();
3923 *result = NULL;
3924 return 0;
3925 } else
3926 return -1;
3927 }
3928 else if (x == Py_None) {
3929 *result = x;
3930 return 0;
3931 }
3932 else if (PyInt_Check(x)) {
3933 long value = PyInt_AS_LONG(x);
3934 long max = PyUnicode_GetMax();
3935 if (value < 0 || value > max) {
3936 PyErr_Format(PyExc_TypeError,
3937 "character mapping must be in range(0x%lx)", max+1);
3938 Py_DECREF(x);
3939 return -1;
3940 }
3941 *result = x;
3942 return 0;
3943 }
3944 else if (PyUnicode_Check(x)) {
3945 *result = x;
3946 return 0;
3947 }
3948 else {
3949 /* wrong return value */
3950 PyErr_SetString(PyExc_TypeError,
3951 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003952 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 return -1;
3954 }
3955}
3956/* ensure that *outobj is at least requiredsize characters long,
3957if not reallocate and adjust various state variables.
3958Return 0 on success, -1 on error */
3959static
Walter Dörwald4894c302003-10-24 14:25:28 +00003960int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003961 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003963 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003964 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003966 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003968 if (requiredsize < 2 * oldsize)
3969 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003970 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 return -1;
3972 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 }
3974 return 0;
3975}
3976/* lookup the character, put the result in the output string and adjust
3977 various state variables. Return a new reference to the object that
3978 was put in the output buffer in *result, or Py_None, if the mapping was
3979 undefined (in which case no character was written).
3980 The called must decref result.
3981 Return 0 on success, -1 on error. */
3982static
Walter Dörwald4894c302003-10-24 14:25:28 +00003983int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003984 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003985 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986{
Walter Dörwald4894c302003-10-24 14:25:28 +00003987 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 return -1;
3989 if (*res==NULL) {
3990 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003991 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 }
3993 else if (*res==Py_None)
3994 ;
3995 else if (PyInt_Check(*res)) {
3996 /* no overflow check, because we know that the space is enough */
3997 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3998 }
3999 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004000 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 if (repsize==1) {
4002 /* no overflow check, because we know that the space is enough */
4003 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4004 }
4005 else if (repsize!=0) {
4006 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004007 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004008 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004009 repsize - 1;
4010 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 return -1;
4012 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4013 *outp += repsize;
4014 }
4015 }
4016 else
4017 return -1;
4018 return 0;
4019}
4020
4021PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004022 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 PyObject *mapping,
4024 const char *errors)
4025{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 /* output object */
4027 PyObject *res = NULL;
4028 /* pointers to the beginning and end+1 of input */
4029 const Py_UNICODE *startp = p;
4030 const Py_UNICODE *endp = p + size;
4031 /* pointer into the output */
4032 Py_UNICODE *str;
4033 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004034 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004035 char *reason = "character maps to <undefined>";
4036 PyObject *errorHandler = NULL;
4037 PyObject *exc = NULL;
4038 /* the following variable is used for caching string comparisons
4039 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4040 * 3=ignore, 4=xmlcharrefreplace */
4041 int known_errorHandler = -1;
4042
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 if (mapping == NULL) {
4044 PyErr_BadArgument();
4045 return NULL;
4046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047
4048 /* allocate enough for a simple 1:1 translation without
4049 replacements, if we need more, we'll resize */
4050 res = PyUnicode_FromUnicode(NULL, size);
4051 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004052 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 return res;
4055 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 while (p<endp) {
4058 /* try to encode it */
4059 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004060 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 goto onError;
4063 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004064 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 if (x!=Py_None) /* it worked => adjust input pointer */
4066 ++p;
4067 else { /* untranslatable character */
4068 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004069 Py_ssize_t repsize;
4070 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 Py_UNICODE *uni2;
4072 /* startpos for collecting untranslatable chars */
4073 const Py_UNICODE *collstart = p;
4074 const Py_UNICODE *collend = p+1;
4075 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 /* find all untranslatable characters */
4078 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004079 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 goto onError;
4081 Py_XDECREF(x);
4082 if (x!=Py_None)
4083 break;
4084 ++collend;
4085 }
4086 /* cache callback name lookup
4087 * (if not done yet, i.e. it's the first error) */
4088 if (known_errorHandler==-1) {
4089 if ((errors==NULL) || (!strcmp(errors, "strict")))
4090 known_errorHandler = 1;
4091 else if (!strcmp(errors, "replace"))
4092 known_errorHandler = 2;
4093 else if (!strcmp(errors, "ignore"))
4094 known_errorHandler = 3;
4095 else if (!strcmp(errors, "xmlcharrefreplace"))
4096 known_errorHandler = 4;
4097 else
4098 known_errorHandler = 0;
4099 }
4100 switch (known_errorHandler) {
4101 case 1: /* strict */
4102 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4103 goto onError;
4104 case 2: /* replace */
4105 /* No need to check for space, this is a 1:1 replacement */
4106 for (coll = collstart; coll<collend; ++coll)
4107 *str++ = '?';
4108 /* fall through */
4109 case 3: /* ignore */
4110 p = collend;
4111 break;
4112 case 4: /* xmlcharrefreplace */
4113 /* generate replacement (temporarily (mis)uses p) */
4114 for (p = collstart; p < collend; ++p) {
4115 char buffer[2+29+1+1];
4116 char *cp;
4117 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004118 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4120 goto onError;
4121 for (cp = buffer; *cp; ++cp)
4122 *str++ = *cp;
4123 }
4124 p = collend;
4125 break;
4126 default:
4127 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4128 reason, startp, size, &exc,
4129 collstart-startp, collend-startp, &newpos);
4130 if (repunicode == NULL)
4131 goto onError;
4132 /* generate replacement */
4133 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004134 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4136 Py_DECREF(repunicode);
4137 goto onError;
4138 }
4139 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4140 *str++ = *uni2;
4141 p = startp + newpos;
4142 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 }
4144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 /* Resize if we allocated to much */
4147 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004148 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004149 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004150 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 }
4152 Py_XDECREF(exc);
4153 Py_XDECREF(errorHandler);
4154 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 onError:
4157 Py_XDECREF(res);
4158 Py_XDECREF(exc);
4159 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 return NULL;
4161}
4162
4163PyObject *PyUnicode_Translate(PyObject *str,
4164 PyObject *mapping,
4165 const char *errors)
4166{
4167 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004168
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 str = PyUnicode_FromObject(str);
4170 if (str == NULL)
4171 goto onError;
4172 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4173 PyUnicode_GET_SIZE(str),
4174 mapping,
4175 errors);
4176 Py_DECREF(str);
4177 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004178
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 onError:
4180 Py_XDECREF(str);
4181 return NULL;
4182}
Tim Petersced69f82003-09-16 20:30:58 +00004183
Guido van Rossum9e896b32000-04-05 20:11:21 +00004184/* --- Decimal Encoder ---------------------------------------------------- */
4185
4186int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004187 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004188 char *output,
4189 const char *errors)
4190{
4191 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 PyObject *errorHandler = NULL;
4193 PyObject *exc = NULL;
4194 const char *encoding = "decimal";
4195 const char *reason = "invalid decimal Unicode string";
4196 /* the following variable is used for caching string comparisons
4197 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4198 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004199
4200 if (output == NULL) {
4201 PyErr_BadArgument();
4202 return -1;
4203 }
4204
4205 p = s;
4206 end = s + length;
4207 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004209 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004211 Py_ssize_t repsize;
4212 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 Py_UNICODE *uni2;
4214 Py_UNICODE *collstart;
4215 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004216
Guido van Rossum9e896b32000-04-05 20:11:21 +00004217 if (Py_UNICODE_ISSPACE(ch)) {
4218 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004220 continue;
4221 }
4222 decimal = Py_UNICODE_TODECIMAL(ch);
4223 if (decimal >= 0) {
4224 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004226 continue;
4227 }
Guido van Rossumba477042000-04-06 18:18:10 +00004228 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004229 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004231 continue;
4232 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 /* All other characters are considered unencodable */
4234 collstart = p;
4235 collend = p+1;
4236 while (collend < end) {
4237 if ((0 < *collend && *collend < 256) ||
4238 !Py_UNICODE_ISSPACE(*collend) ||
4239 Py_UNICODE_TODECIMAL(*collend))
4240 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004241 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 /* cache callback name lookup
4243 * (if not done yet, i.e. it's the first error) */
4244 if (known_errorHandler==-1) {
4245 if ((errors==NULL) || (!strcmp(errors, "strict")))
4246 known_errorHandler = 1;
4247 else if (!strcmp(errors, "replace"))
4248 known_errorHandler = 2;
4249 else if (!strcmp(errors, "ignore"))
4250 known_errorHandler = 3;
4251 else if (!strcmp(errors, "xmlcharrefreplace"))
4252 known_errorHandler = 4;
4253 else
4254 known_errorHandler = 0;
4255 }
4256 switch (known_errorHandler) {
4257 case 1: /* strict */
4258 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4259 goto onError;
4260 case 2: /* replace */
4261 for (p = collstart; p < collend; ++p)
4262 *output++ = '?';
4263 /* fall through */
4264 case 3: /* ignore */
4265 p = collend;
4266 break;
4267 case 4: /* xmlcharrefreplace */
4268 /* generate replacement (temporarily (mis)uses p) */
4269 for (p = collstart; p < collend; ++p)
4270 output += sprintf(output, "&#%d;", (int)*p);
4271 p = collend;
4272 break;
4273 default:
4274 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4275 encoding, reason, s, length, &exc,
4276 collstart-s, collend-s, &newpos);
4277 if (repunicode == NULL)
4278 goto onError;
4279 /* generate replacement */
4280 repsize = PyUnicode_GET_SIZE(repunicode);
4281 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4282 Py_UNICODE ch = *uni2;
4283 if (Py_UNICODE_ISSPACE(ch))
4284 *output++ = ' ';
4285 else {
4286 decimal = Py_UNICODE_TODECIMAL(ch);
4287 if (decimal >= 0)
4288 *output++ = '0' + decimal;
4289 else if (0 < ch && ch < 256)
4290 *output++ = (char)ch;
4291 else {
4292 Py_DECREF(repunicode);
4293 raise_encode_exception(&exc, encoding,
4294 s, length, collstart-s, collend-s, reason);
4295 goto onError;
4296 }
4297 }
4298 }
4299 p = s + newpos;
4300 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004301 }
4302 }
4303 /* 0-terminate the output string */
4304 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305 Py_XDECREF(exc);
4306 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004307 return 0;
4308
4309 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 Py_XDECREF(exc);
4311 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004312 return -1;
4313}
4314
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315/* --- Helpers ------------------------------------------------------------ */
4316
Fredrik Lundha50d2012006-05-26 17:04:58 +00004317#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004318
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004319#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004320#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004321#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004322
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004323Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004324STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4325{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004326 if (str[0] != other[0])
4327 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004328 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4329}
4330
Fredrik Lundhb9479482006-05-26 17:22:38 +00004331#define STRINGLIB_EMPTY unicode_empty
4332
Fredrik Lundha50d2012006-05-26 17:04:58 +00004333#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004334
4335#include "stringlib/count.h"
4336#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004337#include "stringlib/partition.h"
4338
/* helper macro to fixup start/end slice values.
   Operates on variables named start and end in the calling scope: a
   negative value has (obj)->length added to it and is clamped to 0 if it is
   still negative; end is first capped at (obj)->length.
   Wrapped in do { } while (0) so that it expands to a single statement and
   is safe under an unbraced if/else; use it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4351
Martin v. Löwis18e16552006-02-15 17:27:45 +00004352Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004353 PyObject *substr,
4354 Py_ssize_t start,
4355 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004357 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004358 PyUnicodeObject* str_obj;
4359 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004360
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004361 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4362 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004364 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4365 if (!sub_obj) {
4366 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367 return -1;
4368 }
Tim Petersced69f82003-09-16 20:30:58 +00004369
Fredrik Lundhc8162812006-05-26 19:33:03 +00004370 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004371
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004372 result = stringlib_count(
4373 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4374 );
4375
4376 Py_DECREF(sub_obj);
4377 Py_DECREF(str_obj);
4378
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 return result;
4380}
4381
Martin v. Löwis18e16552006-02-15 17:27:45 +00004382Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004383 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004384 Py_ssize_t start,
4385 Py_ssize_t end,
4386 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004388 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004389
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004390 str = PyUnicode_FromObject(str);
4391 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004392 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004393 sub = PyUnicode_FromObject(sub);
4394 if (!sub) {
4395 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004396 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 }
Tim Petersced69f82003-09-16 20:30:58 +00004398
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004399 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004400 result = stringlib_find_slice(
4401 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4402 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4403 start, end
4404 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004405 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004406 result = stringlib_rfind_slice(
4407 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4408 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4409 start, end
4410 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004411
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004412 Py_DECREF(str);
4413 Py_DECREF(sub);
4414
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 return result;
4416}
4417
Tim Petersced69f82003-09-16 20:30:58 +00004418static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419int tailmatch(PyUnicodeObject *self,
4420 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004421 Py_ssize_t start,
4422 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 int direction)
4424{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 if (substring->length == 0)
4426 return 1;
4427
Fredrik Lundhc8162812006-05-26 19:33:03 +00004428 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429
4430 end -= substring->length;
4431 if (end < start)
4432 return 0;
4433
4434 if (direction > 0) {
4435 if (Py_UNICODE_MATCH(self, end, substring))
4436 return 1;
4437 } else {
4438 if (Py_UNICODE_MATCH(self, start, substring))
4439 return 1;
4440 }
4441
4442 return 0;
4443}
4444
Martin v. Löwis18e16552006-02-15 17:27:45 +00004445Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004447 Py_ssize_t start,
4448 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 int direction)
4450{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004451 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 str = PyUnicode_FromObject(str);
4454 if (str == NULL)
4455 return -1;
4456 substr = PyUnicode_FromObject(substr);
4457 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004458 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 return -1;
4460 }
Tim Petersced69f82003-09-16 20:30:58 +00004461
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 result = tailmatch((PyUnicodeObject *)str,
4463 (PyUnicodeObject *)substr,
4464 start, end, direction);
4465 Py_DECREF(str);
4466 Py_DECREF(substr);
4467 return result;
4468}
4469
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470/* Apply fixfct filter to the Unicode object self and return a
4471 reference to the modified object */
4472
Tim Petersced69f82003-09-16 20:30:58 +00004473static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474PyObject *fixup(PyUnicodeObject *self,
4475 int (*fixfct)(PyUnicodeObject *s))
4476{
4477
4478 PyUnicodeObject *u;
4479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004480 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 if (u == NULL)
4482 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004483
4484 Py_UNICODE_COPY(u->str, self->str, self->length);
4485
Tim Peters7a29bd52001-09-12 03:03:31 +00004486 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487 /* fixfct should return TRUE if it modified the buffer. If
4488 FALSE, return a reference to the original buffer instead
4489 (to save space, not time) */
4490 Py_INCREF(self);
4491 Py_DECREF(u);
4492 return (PyObject*) self;
4493 }
4494 return (PyObject*) u;
4495}
4496
Tim Petersced69f82003-09-16 20:30:58 +00004497static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498int fixupper(PyUnicodeObject *self)
4499{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 Py_UNICODE *s = self->str;
4502 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004503
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 while (len-- > 0) {
4505 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004506
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 ch = Py_UNICODE_TOUPPER(*s);
4508 if (ch != *s) {
4509 status = 1;
4510 *s = ch;
4511 }
4512 s++;
4513 }
4514
4515 return status;
4516}
4517
Tim Petersced69f82003-09-16 20:30:58 +00004518static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519int fixlower(PyUnicodeObject *self)
4520{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004521 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 Py_UNICODE *s = self->str;
4523 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004524
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 while (len-- > 0) {
4526 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004527
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 ch = Py_UNICODE_TOLOWER(*s);
4529 if (ch != *s) {
4530 status = 1;
4531 *s = ch;
4532 }
4533 s++;
4534 }
4535
4536 return status;
4537}
4538
Tim Petersced69f82003-09-16 20:30:58 +00004539static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540int fixswapcase(PyUnicodeObject *self)
4541{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004542 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 Py_UNICODE *s = self->str;
4544 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004545
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 while (len-- > 0) {
4547 if (Py_UNICODE_ISUPPER(*s)) {
4548 *s = Py_UNICODE_TOLOWER(*s);
4549 status = 1;
4550 } else if (Py_UNICODE_ISLOWER(*s)) {
4551 *s = Py_UNICODE_TOUPPER(*s);
4552 status = 1;
4553 }
4554 s++;
4555 }
4556
4557 return status;
4558}
4559
Tim Petersced69f82003-09-16 20:30:58 +00004560static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561int fixcapitalize(PyUnicodeObject *self)
4562{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004564 Py_UNICODE *s = self->str;
4565 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004566
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004567 if (len == 0)
4568 return 0;
4569 if (Py_UNICODE_ISLOWER(*s)) {
4570 *s = Py_UNICODE_TOUPPER(*s);
4571 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004573 s++;
4574 while (--len > 0) {
4575 if (Py_UNICODE_ISUPPER(*s)) {
4576 *s = Py_UNICODE_TOLOWER(*s);
4577 status = 1;
4578 }
4579 s++;
4580 }
4581 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582}
4583
4584static
4585int fixtitle(PyUnicodeObject *self)
4586{
4587 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4588 register Py_UNICODE *e;
4589 int previous_is_cased;
4590
4591 /* Shortcut for single character strings */
4592 if (PyUnicode_GET_SIZE(self) == 1) {
4593 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4594 if (*p != ch) {
4595 *p = ch;
4596 return 1;
4597 }
4598 else
4599 return 0;
4600 }
Tim Petersced69f82003-09-16 20:30:58 +00004601
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602 e = p + PyUnicode_GET_SIZE(self);
4603 previous_is_cased = 0;
4604 for (; p < e; p++) {
4605 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004606
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 if (previous_is_cased)
4608 *p = Py_UNICODE_TOLOWER(ch);
4609 else
4610 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004611
4612 if (Py_UNICODE_ISLOWER(ch) ||
4613 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614 Py_UNICODE_ISTITLE(ch))
4615 previous_is_cased = 1;
4616 else
4617 previous_is_cased = 0;
4618 }
4619 return 1;
4620}
4621
Tim Peters8ce9f162004-08-27 01:49:32 +00004622PyObject *
4623PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624{
Tim Peters8ce9f162004-08-27 01:49:32 +00004625 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004626 const Py_UNICODE blank = ' ';
4627 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004628 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004629 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004630 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4631 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004632 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4633 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004634 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004635 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004636 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637
Tim Peters05eba1f2004-08-27 21:32:02 +00004638 fseq = PySequence_Fast(seq, "");
4639 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004640 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004641 }
4642
Tim Peters91879ab2004-08-27 22:35:44 +00004643 /* Grrrr. A codec may be invoked to convert str objects to
4644 * Unicode, and so it's possible to call back into Python code
4645 * during PyUnicode_FromObject(), and so it's possible for a sick
4646 * codec to change the size of fseq (if seq is a list). Therefore
4647 * we have to keep refetching the size -- can't assume seqlen
4648 * is invariant.
4649 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004650 seqlen = PySequence_Fast_GET_SIZE(fseq);
4651 /* If empty sequence, return u"". */
4652 if (seqlen == 0) {
4653 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4654 goto Done;
4655 }
4656 /* If singleton sequence with an exact Unicode, return that. */
4657 if (seqlen == 1) {
4658 item = PySequence_Fast_GET_ITEM(fseq, 0);
4659 if (PyUnicode_CheckExact(item)) {
4660 Py_INCREF(item);
4661 res = (PyUnicodeObject *)item;
4662 goto Done;
4663 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004664 }
4665
Tim Peters05eba1f2004-08-27 21:32:02 +00004666 /* At least two items to join, or one that isn't exact Unicode. */
4667 if (seqlen > 1) {
4668 /* Set up sep and seplen -- they're needed. */
4669 if (separator == NULL) {
4670 sep = &blank;
4671 seplen = 1;
4672 }
4673 else {
4674 internal_separator = PyUnicode_FromObject(separator);
4675 if (internal_separator == NULL)
4676 goto onError;
4677 sep = PyUnicode_AS_UNICODE(internal_separator);
4678 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004679 /* In case PyUnicode_FromObject() mutated seq. */
4680 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004681 }
4682 }
4683
4684 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004685 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004686 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004687 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004688 res_p = PyUnicode_AS_UNICODE(res);
4689 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004690
Tim Peters05eba1f2004-08-27 21:32:02 +00004691 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004692 Py_ssize_t itemlen;
4693 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004694
4695 item = PySequence_Fast_GET_ITEM(fseq, i);
4696 /* Convert item to Unicode. */
4697 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4698 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004699 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004700 " %.80s found",
4701 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004702 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004703 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004704 item = PyUnicode_FromObject(item);
4705 if (item == NULL)
4706 goto onError;
4707 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004708
Tim Peters91879ab2004-08-27 22:35:44 +00004709 /* In case PyUnicode_FromObject() mutated seq. */
4710 seqlen = PySequence_Fast_GET_SIZE(fseq);
4711
Tim Peters8ce9f162004-08-27 01:49:32 +00004712 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004714 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004715 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004716 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004717 if (i < seqlen - 1) {
4718 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004719 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004720 goto Overflow;
4721 }
4722 if (new_res_used > res_alloc) {
4723 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004724 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004725 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004726 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004727 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004728 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004729 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004730 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004732 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004733 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004735
4736 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004737 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004738 res_p += itemlen;
4739 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004740 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004741 res_p += seplen;
4742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004744 res_used = new_res_used;
4745 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004746
Tim Peters05eba1f2004-08-27 21:32:02 +00004747 /* Shrink res to match the used area; this probably can't fail,
4748 * but it's cheap to check.
4749 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004750 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004751 goto onError;
4752
4753 Done:
4754 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004755 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 return (PyObject *)res;
4757
Tim Peters8ce9f162004-08-27 01:49:32 +00004758 Overflow:
4759 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004760 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004761 Py_DECREF(item);
4762 /* fall through */
4763
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004765 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004766 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004767 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 return NULL;
4769}
4770
Tim Petersced69f82003-09-16 20:30:58 +00004771static
4772PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004773 Py_ssize_t left,
4774 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 Py_UNICODE fill)
4776{
4777 PyUnicodeObject *u;
4778
4779 if (left < 0)
4780 left = 0;
4781 if (right < 0)
4782 right = 0;
4783
Tim Peters7a29bd52001-09-12 03:03:31 +00004784 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 Py_INCREF(self);
4786 return self;
4787 }
4788
Neal Norwitz4f3be8a2008-07-31 17:08:14 +00004789 if (left > PY_SSIZE_T_MAX - self->length ||
4790 right > PY_SSIZE_T_MAX - (left + self->length)) {
4791 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
4792 return NULL;
4793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 u = _PyUnicode_New(left + self->length + right);
4795 if (u) {
4796 if (left)
4797 Py_UNICODE_FILL(u->str, fill, left);
4798 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4799 if (right)
4800 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4801 }
4802
4803 return u;
4804}
4805
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch variable,
   a `PyObject *list` to append to, and an `onError` label; the macro jumps
   to `onError` when creating the slice or appending it fails.  The
   temporary reference held in `str` is always released.

   The body is wrapped in do { } while (0) so that an invocation followed
   by a semicolon is exactly one statement and cannot capture or be
   captured by a neighbouring if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4816
4817static
4818PyObject *split_whitespace(PyUnicodeObject *self,
4819 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004820 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004822 register Py_ssize_t i;
4823 register Py_ssize_t j;
4824 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 PyObject *str;
4826
4827 for (i = j = 0; i < len; ) {
4828 /* find a token */
4829 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4830 i++;
4831 j = i;
4832 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4833 i++;
4834 if (j < i) {
4835 if (maxcount-- <= 0)
4836 break;
4837 SPLIT_APPEND(self->str, j, i);
4838 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4839 i++;
4840 j = i;
4841 }
4842 }
4843 if (j < len) {
4844 SPLIT_APPEND(self->str, j, len);
4845 }
4846 return list;
4847
4848 onError:
4849 Py_DECREF(list);
4850 return NULL;
4851}
4852
4853PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004854 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004856 register Py_ssize_t i;
4857 register Py_ssize_t j;
4858 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 PyObject *list;
4860 PyObject *str;
4861 Py_UNICODE *data;
4862
4863 string = PyUnicode_FromObject(string);
4864 if (string == NULL)
4865 return NULL;
4866 data = PyUnicode_AS_UNICODE(string);
4867 len = PyUnicode_GET_SIZE(string);
4868
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 list = PyList_New(0);
4870 if (!list)
4871 goto onError;
4872
4873 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004874 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004877 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
4880 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004881 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 if (i < len) {
4883 if (data[i] == '\r' && i + 1 < len &&
4884 data[i+1] == '\n')
4885 i += 2;
4886 else
4887 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004888 if (keepends)
4889 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 }
Guido van Rossum86662912000-04-11 15:38:46 +00004891 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 j = i;
4893 }
4894 if (j < len) {
4895 SPLIT_APPEND(data, j, len);
4896 }
4897
4898 Py_DECREF(string);
4899 return list;
4900
4901 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004902 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 Py_DECREF(string);
4904 return NULL;
4905}
4906
Tim Petersced69f82003-09-16 20:30:58 +00004907static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908PyObject *split_char(PyUnicodeObject *self,
4909 PyObject *list,
4910 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004911 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004913 register Py_ssize_t i;
4914 register Py_ssize_t j;
4915 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 PyObject *str;
4917
4918 for (i = j = 0; i < len; ) {
4919 if (self->str[i] == ch) {
4920 if (maxcount-- <= 0)
4921 break;
4922 SPLIT_APPEND(self->str, j, i);
4923 i = j = i + 1;
4924 } else
4925 i++;
4926 }
4927 if (j <= len) {
4928 SPLIT_APPEND(self->str, j, len);
4929 }
4930 return list;
4931
4932 onError:
4933 Py_DECREF(list);
4934 return NULL;
4935}
4936
Tim Petersced69f82003-09-16 20:30:58 +00004937static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938PyObject *split_substring(PyUnicodeObject *self,
4939 PyObject *list,
4940 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004941 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004943 register Py_ssize_t i;
4944 register Py_ssize_t j;
4945 Py_ssize_t len = self->length;
4946 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 PyObject *str;
4948
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004949 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 if (Py_UNICODE_MATCH(self, i, substring)) {
4951 if (maxcount-- <= 0)
4952 break;
4953 SPLIT_APPEND(self->str, j, i);
4954 i = j = i + sublen;
4955 } else
4956 i++;
4957 }
4958 if (j <= len) {
4959 SPLIT_APPEND(self->str, j, len);
4960 }
4961 return list;
4962
4963 onError:
4964 Py_DECREF(list);
4965 return NULL;
4966}
4967
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004968static
4969PyObject *rsplit_whitespace(PyUnicodeObject *self,
4970 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004971 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004973 register Py_ssize_t i;
4974 register Py_ssize_t j;
4975 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004976 PyObject *str;
4977
4978 for (i = j = len - 1; i >= 0; ) {
4979 /* find a token */
4980 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4981 i--;
4982 j = i;
4983 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4984 i--;
4985 if (j > i) {
4986 if (maxcount-- <= 0)
4987 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004988 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004989 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4990 i--;
4991 j = i;
4992 }
4993 }
4994 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004995 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004996 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004997 if (PyList_Reverse(list) < 0)
4998 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004999 return list;
5000
5001 onError:
5002 Py_DECREF(list);
5003 return NULL;
5004}
5005
5006static
5007PyObject *rsplit_char(PyUnicodeObject *self,
5008 PyObject *list,
5009 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005010 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005011{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005012 register Py_ssize_t i;
5013 register Py_ssize_t j;
5014 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005015 PyObject *str;
5016
5017 for (i = j = len - 1; i >= 0; ) {
5018 if (self->str[i] == ch) {
5019 if (maxcount-- <= 0)
5020 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005021 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005022 j = i = i - 1;
5023 } else
5024 i--;
5025 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005026 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005027 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005028 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005029 if (PyList_Reverse(list) < 0)
5030 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005031 return list;
5032
5033 onError:
5034 Py_DECREF(list);
5035 return NULL;
5036}
5037
5038static
5039PyObject *rsplit_substring(PyUnicodeObject *self,
5040 PyObject *list,
5041 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005042 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005044 register Py_ssize_t i;
5045 register Py_ssize_t j;
5046 Py_ssize_t len = self->length;
5047 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005048 PyObject *str;
5049
5050 for (i = len - sublen, j = len; i >= 0; ) {
5051 if (Py_UNICODE_MATCH(self, i, substring)) {
5052 if (maxcount-- <= 0)
5053 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005054 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005055 j = i;
5056 i -= sublen;
5057 } else
5058 i--;
5059 }
5060 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005061 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005062 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005063 if (PyList_Reverse(list) < 0)
5064 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005065 return list;
5066
5067 onError:
5068 Py_DECREF(list);
5069 return NULL;
5070}
5071
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072#undef SPLIT_APPEND
5073
5074static
5075PyObject *split(PyUnicodeObject *self,
5076 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005077 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078{
5079 PyObject *list;
5080
5081 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005082 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
5084 list = PyList_New(0);
5085 if (!list)
5086 return NULL;
5087
5088 if (substring == NULL)
5089 return split_whitespace(self,list,maxcount);
5090
5091 else if (substring->length == 1)
5092 return split_char(self,list,substring->str[0],maxcount);
5093
5094 else if (substring->length == 0) {
5095 Py_DECREF(list);
5096 PyErr_SetString(PyExc_ValueError, "empty separator");
5097 return NULL;
5098 }
5099 else
5100 return split_substring(self,list,substring,maxcount);
5101}
5102
Tim Petersced69f82003-09-16 20:30:58 +00005103static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005104PyObject *rsplit(PyUnicodeObject *self,
5105 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005106 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005107{
5108 PyObject *list;
5109
5110 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005111 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005112
5113 list = PyList_New(0);
5114 if (!list)
5115 return NULL;
5116
5117 if (substring == NULL)
5118 return rsplit_whitespace(self,list,maxcount);
5119
5120 else if (substring->length == 1)
5121 return rsplit_char(self,list,substring->str[0],maxcount);
5122
5123 else if (substring->length == 0) {
5124 Py_DECREF(list);
5125 PyErr_SetString(PyExc_ValueError, "empty separator");
5126 return NULL;
5127 }
5128 else
5129 return rsplit_substring(self,list,substring,maxcount);
5130}
5131
5132static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133PyObject *replace(PyUnicodeObject *self,
5134 PyUnicodeObject *str1,
5135 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005136 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137{
5138 PyUnicodeObject *u;
5139
5140 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005141 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142
Fredrik Lundh347ee272006-05-24 16:35:18 +00005143 if (str1->length == str2->length) {
5144 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005145 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005146 if (str1->length == 1) {
5147 /* replace characters */
5148 Py_UNICODE u1, u2;
5149 if (!findchar(self->str, self->length, str1->str[0]))
5150 goto nothing;
5151 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5152 if (!u)
5153 return NULL;
5154 Py_UNICODE_COPY(u->str, self->str, self->length);
5155 u1 = str1->str[0];
5156 u2 = str2->str[0];
5157 for (i = 0; i < u->length; i++)
5158 if (u->str[i] == u1) {
5159 if (--maxcount < 0)
5160 break;
5161 u->str[i] = u2;
5162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005164 i = fastsearch(
5165 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005167 if (i < 0)
5168 goto nothing;
5169 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5170 if (!u)
5171 return NULL;
5172 Py_UNICODE_COPY(u->str, self->str, self->length);
5173 while (i <= self->length - str1->length)
5174 if (Py_UNICODE_MATCH(self, i, str1)) {
5175 if (--maxcount < 0)
5176 break;
5177 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5178 i += str1->length;
5179 } else
5180 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005183
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005184 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005185 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 Py_UNICODE *p;
5187
5188 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005189 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 if (n > maxcount)
5191 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005192 if (n == 0)
5193 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005194 /* new_size = self->length + n * (str2->length - str1->length)); */
5195 delta = (str2->length - str1->length);
5196 if (delta == 0) {
5197 new_size = self->length;
5198 } else {
5199 product = n * (str2->length - str1->length);
5200 if ((product / (str2->length - str1->length)) != n) {
5201 PyErr_SetString(PyExc_OverflowError,
5202 "replace string is too long");
5203 return NULL;
5204 }
5205 new_size = self->length + product;
5206 if (new_size < 0) {
5207 PyErr_SetString(PyExc_OverflowError,
5208 "replace string is too long");
5209 return NULL;
5210 }
5211 }
5212 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005213 if (!u)
5214 return NULL;
5215 i = 0;
5216 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005217 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005218 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005219 while (n-- > 0) {
5220 /* look for next match */
5221 j = i;
5222 while (j <= e) {
5223 if (Py_UNICODE_MATCH(self, j, str1))
5224 break;
5225 j++;
5226 }
5227 if (j > i) {
5228 if (j > e)
5229 break;
5230 /* copy unchanged part [i:j] */
5231 Py_UNICODE_COPY(p, self->str+i, j-i);
5232 p += j - i;
5233 }
5234 /* copy substitution string */
5235 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005236 Py_UNICODE_COPY(p, str2->str, str2->length);
5237 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005238 }
5239 i = j + str1->length;
5240 }
5241 if (i < self->length)
5242 /* copy tail [i:] */
5243 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005244 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005245 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005246 while (n > 0) {
5247 Py_UNICODE_COPY(p, str2->str, str2->length);
5248 p += str2->length;
5249 if (--n <= 0)
5250 break;
5251 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005253 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 }
5255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005257
5258nothing:
5259 /* nothing to replace; return original string (when possible) */
5260 if (PyUnicode_CheckExact(self)) {
5261 Py_INCREF(self);
5262 return (PyObject *) self;
5263 }
5264 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265}
5266
5267/* --- Unicode Object Methods --------------------------------------------- */
5268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005269PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270"S.title() -> unicode\n\
5271\n\
5272Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005273characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
5275static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005276unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 return fixup(self, fixtitle);
5279}
5280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005281PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282"S.capitalize() -> unicode\n\
5283\n\
5284Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005285have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286
5287static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005288unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 return fixup(self, fixcapitalize);
5291}
5292
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of unicode.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the words with
   the default separator.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5330
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005331/* Argument converter. Coerces to a single unicode character */
5332
5333static int
5334convert_uc(PyObject *obj, void *addr)
5335{
5336 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5337 PyObject *uniobj;
5338 Py_UNICODE *unistr;
5339
5340 uniobj = PyUnicode_FromObject(obj);
5341 if (uniobj == NULL) {
5342 PyErr_SetString(PyExc_TypeError,
5343 "The fill character cannot be converted to Unicode");
5344 return 0;
5345 }
5346 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5347 PyErr_SetString(PyExc_TypeError,
5348 "The fill character must be exactly one character long");
5349 Py_DECREF(uniobj);
5350 return 0;
5351 }
5352 unistr = PyUnicode_AS_UNICODE(uniobj);
5353 *fillcharloc = unistr[0];
5354 Py_DECREF(uniobj);
5355 return 1;
5356}
5357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005358PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005359"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005361Return S centered in a Unicode string of length width. Padding is\n\
5362done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363
5364static PyObject *
5365unicode_center(PyUnicodeObject *self, PyObject *args)
5366{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005367 Py_ssize_t marg, left;
5368 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005369 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
Thomas Woutersde017742006-02-16 19:34:37 +00005371 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 return NULL;
5373
Tim Peters7a29bd52001-09-12 03:03:31 +00005374 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 Py_INCREF(self);
5376 return (PyObject*) self;
5377 }
5378
5379 marg = width - self->length;
5380 left = marg / 2 + (marg & width & 1);
5381
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005382 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383}
5384
Marc-André Lemburge5034372000-08-08 08:04:29 +00005385#if 0
5386
5387/* This code should go into some future Unicode collation support
5388 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005389 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005390
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005391/* speedy UTF-16 code point order comparison */
5392/* gleaned from: */
5393/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5394
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005395static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005396{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005397 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005398 0, 0, 0, 0, 0, 0, 0, 0,
5399 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005400 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005401};
5402
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403static int
5404unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5405{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005406 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 Py_UNICODE *s1 = str1->str;
5409 Py_UNICODE *s2 = str2->str;
5410
5411 len1 = str1->length;
5412 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005413
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005415 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005416
5417 c1 = *s1++;
5418 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005419
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005420 if (c1 > (1<<11) * 26)
5421 c1 += utf16Fixup[c1>>11];
5422 if (c2 > (1<<11) * 26)
5423 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005424 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005425
5426 if (c1 != c2)
5427 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005428
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005429 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 }
5431
5432 return (len1 < len2) ? -1 : (len1 != len2);
5433}
5434
Marc-André Lemburge5034372000-08-08 08:04:29 +00005435#else
5436
5437static int
5438unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5439{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005440 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005441
5442 Py_UNICODE *s1 = str1->str;
5443 Py_UNICODE *s2 = str2->str;
5444
5445 len1 = str1->length;
5446 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005447
Marc-André Lemburge5034372000-08-08 08:04:29 +00005448 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005449 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005450
Fredrik Lundh45714e92001-06-26 16:39:36 +00005451 c1 = *s1++;
5452 c2 = *s2++;
5453
5454 if (c1 != c2)
5455 return (c1 < c2) ? -1 : 1;
5456
Marc-André Lemburge5034372000-08-08 08:04:29 +00005457 len1--; len2--;
5458 }
5459
5460 return (len1 < len2) ? -1 : (len1 != len2);
5461}
5462
5463#endif
5464
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465int PyUnicode_Compare(PyObject *left,
5466 PyObject *right)
5467{
5468 PyUnicodeObject *u = NULL, *v = NULL;
5469 int result;
5470
5471 /* Coerce the two arguments */
5472 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5473 if (u == NULL)
5474 goto onError;
5475 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5476 if (v == NULL)
5477 goto onError;
5478
Thomas Wouters7e474022000-07-16 12:04:32 +00005479 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 if (v == u) {
5481 Py_DECREF(u);
5482 Py_DECREF(v);
5483 return 0;
5484 }
5485
5486 result = unicode_compare(u, v);
5487
5488 Py_DECREF(u);
5489 Py_DECREF(v);
5490 return result;
5491
5492onError:
5493 Py_XDECREF(u);
5494 Py_XDECREF(v);
5495 return -1;
5496}
5497
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005498PyObject *PyUnicode_RichCompare(PyObject *left,
5499 PyObject *right,
5500 int op)
5501{
5502 int result;
5503
5504 result = PyUnicode_Compare(left, right);
5505 if (result == -1 && PyErr_Occurred())
5506 goto onError;
5507
5508 /* Convert the return value to a Boolean */
5509 switch (op) {
5510 case Py_EQ:
5511 result = (result == 0);
5512 break;
5513 case Py_NE:
5514 result = (result != 0);
5515 break;
5516 case Py_LE:
5517 result = (result <= 0);
5518 break;
5519 case Py_GE:
5520 result = (result >= 0);
5521 break;
5522 case Py_LT:
5523 result = (result == -1);
5524 break;
5525 case Py_GT:
5526 result = (result == 1);
5527 break;
5528 }
5529 return PyBool_FromLong(result);
5530
5531 onError:
5532
5533 /* Standard case
5534
5535 Type errors mean that PyUnicode_FromObject() could not convert
5536 one of the arguments (usually the right hand side) to Unicode,
5537 ie. we can't handle the comparison request. However, it is
5538 possible that the other object knows a comparison method, which
5539 is why we return Py_NotImplemented to give the other object a
5540 chance.
5541
5542 */
5543 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5544 PyErr_Clear();
5545 Py_INCREF(Py_NotImplemented);
5546 return Py_NotImplemented;
5547 }
5548 if (op != Py_EQ && op != Py_NE)
5549 return NULL;
5550
5551 /* Equality comparison.
5552
5553 This is a special case: we silence any PyExc_UnicodeDecodeError
5554 and instead turn it into a PyErr_UnicodeWarning.
5555
5556 */
5557 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5558 return NULL;
5559 PyErr_Clear();
5560 if (PyErr_Warn(PyExc_UnicodeWarning,
5561 (op == Py_EQ) ?
5562 "Unicode equal comparison "
5563 "failed to convert both arguments to Unicode - "
5564 "interpreting them as being unequal" :
5565 "Unicode unequal comparison "
5566 "failed to convert both arguments to Unicode - "
5567 "interpreting them as being unequal"
5568 ) < 0)
5569 return NULL;
5570 result = (op == Py_NE);
5571 return PyBool_FromLong(result);
5572}
5573
Guido van Rossum403d68b2000-03-13 15:55:09 +00005574int PyUnicode_Contains(PyObject *container,
5575 PyObject *element)
5576{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005577 PyObject *str, *sub;
5578 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005579
5580 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005581 sub = PyUnicode_FromObject(element);
5582 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005583 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005584 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005585 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005586 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005587
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005588 str = PyUnicode_FromObject(container);
5589 if (!str) {
5590 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005591 return -1;
5592 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005593
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005594 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005595
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005596 Py_DECREF(str);
5597 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005598
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005599 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005600}
5601
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602/* Concat to string or Unicode object giving a new Unicode object. */
5603
5604PyObject *PyUnicode_Concat(PyObject *left,
5605 PyObject *right)
5606{
5607 PyUnicodeObject *u = NULL, *v = NULL, *w;
5608
5609 /* Coerce the two arguments */
5610 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5611 if (u == NULL)
5612 goto onError;
5613 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5614 if (v == NULL)
5615 goto onError;
5616
5617 /* Shortcuts */
5618 if (v == unicode_empty) {
5619 Py_DECREF(v);
5620 return (PyObject *)u;
5621 }
5622 if (u == unicode_empty) {
5623 Py_DECREF(u);
5624 return (PyObject *)v;
5625 }
5626
5627 /* Concat the two Unicode strings */
5628 w = _PyUnicode_New(u->length + v->length);
5629 if (w == NULL)
5630 goto onError;
5631 Py_UNICODE_COPY(w->str, u->str, u->length);
5632 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5633
5634 Py_DECREF(u);
5635 Py_DECREF(v);
5636 return (PyObject *)w;
5637
5638onError:
5639 Py_XDECREF(u);
5640 Py_XDECREF(v);
5641 return NULL;
5642}
5643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005644PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645"S.count(sub[, start[, end]]) -> int\n\
5646\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005647Return the number of non-overlapping occurrences of substring sub in\n\
5648Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005649interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
5651static PyObject *
5652unicode_count(PyUnicodeObject *self, PyObject *args)
5653{
5654 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005655 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005656 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 PyObject *result;
5658
Guido van Rossumb8872e62000-05-09 14:14:27 +00005659 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5660 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 return NULL;
5662
5663 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005664 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 if (substring == NULL)
5666 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005667
Fredrik Lundhc8162812006-05-26 19:33:03 +00005668 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005670 result = PyInt_FromSsize_t(
5671 stringlib_count(self->str + start, end - start,
5672 substring->str, substring->length)
5673 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
5675 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005676
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 return result;
5678}
5679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005680PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005681"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005683Encodes S using the codec registered for encoding. encoding defaults\n\
5684to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005685handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5687'xmlcharrefreplace' as well as any other name registered with\n\
5688codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690static PyObject *
5691unicode_encode(PyUnicodeObject *self, PyObject *args)
5692{
5693 char *encoding = NULL;
5694 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005695 PyObject *v;
5696
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5698 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005699 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005700 if (v == NULL)
5701 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005702 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5703 PyErr_Format(PyExc_TypeError,
5704 "encoder did not return a string/unicode object "
5705 "(type=%.400s)",
5706 v->ob_type->tp_name);
5707 Py_DECREF(v);
5708 return NULL;
5709 }
5710 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005711
5712 onError:
5713 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005714}
5715
5716PyDoc_STRVAR(decode__doc__,
5717"S.decode([encoding[,errors]]) -> string or unicode\n\
5718\n\
5719Decodes S using the codec registered for encoding. encoding defaults\n\
5720to the default encoding. errors may be given to set a different error\n\
5721handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5722a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5723as well as any other name registerd with codecs.register_error that is\n\
5724able to handle UnicodeDecodeErrors.");
5725
5726static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005727unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005728{
5729 char *encoding = NULL;
5730 char *errors = NULL;
5731 PyObject *v;
5732
5733 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5734 return NULL;
5735 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005736 if (v == NULL)
5737 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005738 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5739 PyErr_Format(PyExc_TypeError,
5740 "decoder did not return a string/unicode object "
5741 "(type=%.400s)",
5742 v->ob_type->tp_name);
5743 Py_DECREF(v);
5744 return NULL;
5745 }
5746 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005747
5748 onError:
5749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750}
5751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005752PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753"S.expandtabs([tabsize]) -> unicode\n\
5754\n\
5755Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005756If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
5758static PyObject*
5759unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5760{
5761 Py_UNICODE *e;
5762 Py_UNICODE *p;
5763 Py_UNICODE *q;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005764 Py_UNICODE *qe;
5765 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 PyUnicodeObject *u;
5767 int tabsize = 8;
5768
5769 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5770 return NULL;
5771
Thomas Wouters7e474022000-07-16 12:04:32 +00005772 /* First pass: determine size of output string */
Guido van Rossum44a93e52008-03-11 21:14:54 +00005773 i = 0; /* chars up to and including most recent \n or \r */
5774 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
5775 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 for (p = self->str; p < e; p++)
5777 if (*p == '\t') {
Neal Norwitz66e64e22007-06-09 04:06:30 +00005778 if (tabsize > 0) {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005779 incr = tabsize - (j % tabsize); /* cannot overflow */
5780 if (j > PY_SSIZE_T_MAX - incr)
5781 goto overflow1;
5782 j += incr;
5783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 }
5785 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005786 if (j > PY_SSIZE_T_MAX - 1)
5787 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 j++;
5789 if (*p == '\n' || *p == '\r') {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005790 if (i > PY_SSIZE_T_MAX - j)
5791 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 i += j;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005793 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 }
5795 }
5796
Guido van Rossum44a93e52008-03-11 21:14:54 +00005797 if (i > PY_SSIZE_T_MAX - j)
5798 goto overflow1;
Neal Norwitz66e64e22007-06-09 04:06:30 +00005799
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 /* Second pass: create output string and fill it */
5801 u = _PyUnicode_New(i + j);
5802 if (!u)
5803 return NULL;
5804
Guido van Rossum44a93e52008-03-11 21:14:54 +00005805 j = 0; /* same as in first pass */
5806 q = u->str; /* next output char */
5807 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808
5809 for (p = self->str; p < e; p++)
5810 if (*p == '\t') {
5811 if (tabsize > 0) {
5812 i = tabsize - (j % tabsize);
5813 j += i;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005814 while (i--) {
5815 if (q >= qe)
5816 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 *q++ = ' ';
Guido van Rossum44a93e52008-03-11 21:14:54 +00005818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 }
5820 }
5821 else {
Guido van Rossum44a93e52008-03-11 21:14:54 +00005822 if (q >= qe)
5823 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 *q++ = *p;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005825 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 if (*p == '\n' || *p == '\r')
5827 j = 0;
5828 }
5829
5830 return (PyObject*) u;
Guido van Rossum44a93e52008-03-11 21:14:54 +00005831
5832 overflow2:
5833 Py_DECREF(u);
5834 overflow1:
5835 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837}
5838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005839PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840"S.find(sub [,start [,end]]) -> int\n\
5841\n\
5842Return the lowest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00005843such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844arguments start and end are interpreted as in slice notation.\n\
5845\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005846Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847
5848static PyObject *
5849unicode_find(PyUnicodeObject *self, PyObject *args)
5850{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005851 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005852 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005853 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005854 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855
Guido van Rossumb8872e62000-05-09 14:14:27 +00005856 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5857 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005859 substring = PyUnicode_FromObject(substring);
5860 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 return NULL;
5862
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005863 result = stringlib_find_slice(
5864 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5865 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5866 start, end
5867 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
5869 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005870
5871 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872}
5873
5874static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005875unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876{
5877 if (index < 0 || index >= self->length) {
5878 PyErr_SetString(PyExc_IndexError, "string index out of range");
5879 return NULL;
5880 }
5881
5882 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5883}
5884
5885static long
5886unicode_hash(PyUnicodeObject *self)
5887{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005888 /* Since Unicode objects compare equal to their ASCII string
5889 counterparts, they should use the individual character values
5890 as basis for their hash value. This is needed to assure that
5891 strings and Unicode objects behave in the same way as
5892 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893
Martin v. Löwis18e16552006-02-15 17:27:45 +00005894 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005895 register Py_UNICODE *p;
5896 register long x;
5897
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 if (self->hash != -1)
5899 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005900 len = PyUnicode_GET_SIZE(self);
5901 p = PyUnicode_AS_UNICODE(self);
5902 x = *p << 7;
5903 while (--len >= 0)
5904 x = (1000003*x) ^ *p++;
5905 x ^= PyUnicode_GET_SIZE(self);
5906 if (x == -1)
5907 x = -2;
5908 self->hash = x;
5909 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910}
5911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005912PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913"S.index(sub [,start [,end]]) -> int\n\
5914\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005915Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
5917static PyObject *
5918unicode_index(PyUnicodeObject *self, PyObject *args)
5919{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005920 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005921 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005922 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005923 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924
Guido van Rossumb8872e62000-05-09 14:14:27 +00005925 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5926 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005928 substring = PyUnicode_FromObject(substring);
5929 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 return NULL;
5931
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005932 result = stringlib_find_slice(
5933 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5934 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5935 start, end
5936 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
5938 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005939
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 if (result < 0) {
5941 PyErr_SetString(PyExc_ValueError, "substring not found");
5942 return NULL;
5943 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005944
Martin v. Löwis18e16552006-02-15 17:27:45 +00005945 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}
5947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005948PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005949"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005951Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005952at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953
5954static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005955unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956{
5957 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5958 register const Py_UNICODE *e;
5959 int cased;
5960
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 /* Shortcut for single character strings */
5962 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005963 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005965 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005966 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005967 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005968
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 e = p + PyUnicode_GET_SIZE(self);
5970 cased = 0;
5971 for (; p < e; p++) {
5972 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005973
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005975 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 else if (!cased && Py_UNICODE_ISLOWER(ch))
5977 cased = 1;
5978 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005979 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980}
5981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005982PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005983"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005985Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005986at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
5988static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005989unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990{
5991 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5992 register const Py_UNICODE *e;
5993 int cased;
5994
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 /* Shortcut for single character strings */
5996 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005997 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005999 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006000 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006001 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006002
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 e = p + PyUnicode_GET_SIZE(self);
6004 cased = 0;
6005 for (; p < e; p++) {
6006 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006007
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006009 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 else if (!cased && Py_UNICODE_ISUPPER(ch))
6011 cased = 1;
6012 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006013 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014}
6015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006016PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006017"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006019Return True if S is a titlecased string and there is at least one\n\
6020character in S, i.e. upper- and titlecase characters may only\n\
6021follow uncased characters and lowercase characters only cased ones.\n\
6022Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
6024static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006025unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026{
6027 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6028 register const Py_UNICODE *e;
6029 int cased, previous_is_cased;
6030
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 /* Shortcut for single character strings */
6032 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006033 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6034 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006036 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006037 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006038 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006039
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 e = p + PyUnicode_GET_SIZE(self);
6041 cased = 0;
6042 previous_is_cased = 0;
6043 for (; p < e; p++) {
6044 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006045
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6047 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006048 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 previous_is_cased = 1;
6050 cased = 1;
6051 }
6052 else if (Py_UNICODE_ISLOWER(ch)) {
6053 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006054 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 previous_is_cased = 1;
6056 cased = 1;
6057 }
6058 else
6059 previous_is_cased = 0;
6060 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006061 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062}
6063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006064PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006065"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006067Return True if all characters in S are whitespace\n\
6068and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069
6070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006071unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072{
6073 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6074 register const Py_UNICODE *e;
6075
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 /* Shortcut for single character strings */
6077 if (PyUnicode_GET_SIZE(self) == 1 &&
6078 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006079 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006081 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006082 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006083 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006084
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 e = p + PyUnicode_GET_SIZE(self);
6086 for (; p < e; p++) {
6087 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006088 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006090 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091}
6092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006093PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006094"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006095\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006096Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006097and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006098
6099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006100unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006101{
6102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6103 register const Py_UNICODE *e;
6104
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006105 /* Shortcut for single character strings */
6106 if (PyUnicode_GET_SIZE(self) == 1 &&
6107 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006108 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006109
6110 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006111 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006112 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006113
6114 e = p + PyUnicode_GET_SIZE(self);
6115 for (; p < e; p++) {
6116 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006117 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006119 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006120}
6121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006122PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006123"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006124\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006125Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006126and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006127
6128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006129unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006130{
6131 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6132 register const Py_UNICODE *e;
6133
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006134 /* Shortcut for single character strings */
6135 if (PyUnicode_GET_SIZE(self) == 1 &&
6136 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006137 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006138
6139 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006140 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006141 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006142
6143 e = p + PyUnicode_GET_SIZE(self);
6144 for (; p < e; p++) {
6145 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006146 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006147 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006148 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006149}
6150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006151PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006152"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006155False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
6157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006158unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159{
6160 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6161 register const Py_UNICODE *e;
6162
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 /* Shortcut for single character strings */
6164 if (PyUnicode_GET_SIZE(self) == 1 &&
6165 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006166 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006168 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006169 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006171
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 e = p + PyUnicode_GET_SIZE(self);
6173 for (; p < e; p++) {
6174 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006175 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006177 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178}
6179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006180PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006181"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006183Return True if all characters in S are digits\n\
6184and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006187unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188{
6189 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6190 register const Py_UNICODE *e;
6191
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 /* Shortcut for single character strings */
6193 if (PyUnicode_GET_SIZE(self) == 1 &&
6194 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006195 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006197 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006198 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006199 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006200
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 e = p + PyUnicode_GET_SIZE(self);
6202 for (; p < e; p++) {
6203 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006204 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006206 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207}
6208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006209PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006210"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006212Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006213False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214
6215static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006216unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217{
6218 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6219 register const Py_UNICODE *e;
6220
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 /* Shortcut for single character strings */
6222 if (PyUnicode_GET_SIZE(self) == 1 &&
6223 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006224 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006226 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006227 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006228 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006229
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 e = p + PyUnicode_GET_SIZE(self);
6231 for (; p < e; p++) {
6232 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006233 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006235 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236}
6237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006238PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239"S.join(sequence) -> unicode\n\
6240\n\
6241Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006242sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243
6244static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006245unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006247 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248}
6249
Martin v. Löwis18e16552006-02-15 17:27:45 +00006250static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251unicode_length(PyUnicodeObject *self)
6252{
6253 return self->length;
6254}
6255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006256PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006257"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258\n\
6259Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006260done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261
6262static PyObject *
6263unicode_ljust(PyUnicodeObject *self, PyObject *args)
6264{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006265 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006266 Py_UNICODE fillchar = ' ';
6267
Martin v. Löwis412fb672006-04-13 06:34:32 +00006268 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 return NULL;
6270
Tim Peters7a29bd52001-09-12 03:03:31 +00006271 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 Py_INCREF(self);
6273 return (PyObject*) self;
6274 }
6275
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006276 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277}
6278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006279PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280"S.lower() -> unicode\n\
6281\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006282Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283
6284static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006285unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 return fixup(self, fixlower);
6288}
6289
/* Strip modes: select which end(s) of the string are stripped.  The
   values double as indexes into stripformat below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its three
   character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6298
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006299/* externally visible for str.strip(unicode) */
6300PyObject *
6301_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6302{
6303 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006304 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006305 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006306 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6307 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006308
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006309 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6310
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006311 i = 0;
6312 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006313 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6314 i++;
6315 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006316 }
6317
6318 j = len;
6319 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006320 do {
6321 j--;
6322 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6323 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006324 }
6325
6326 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006327 Py_INCREF(self);
6328 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006329 }
6330 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006331 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006332}
6333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334
6335static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006336do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006338 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006339 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006340
6341 i = 0;
6342 if (striptype != RIGHTSTRIP) {
6343 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6344 i++;
6345 }
6346 }
6347
6348 j = len;
6349 if (striptype != LEFTSTRIP) {
6350 do {
6351 j--;
6352 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6353 j++;
6354 }
6355
6356 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6357 Py_INCREF(self);
6358 return (PyObject*)self;
6359 }
6360 else
6361 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362}
6363
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006364
6365static PyObject *
6366do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6367{
6368 PyObject *sep = NULL;
6369
6370 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6371 return NULL;
6372
6373 if (sep != NULL && sep != Py_None) {
6374 if (PyUnicode_Check(sep))
6375 return _PyUnicode_XStrip(self, striptype, sep);
6376 else if (PyString_Check(sep)) {
6377 PyObject *res;
6378 sep = PyUnicode_FromObject(sep);
6379 if (sep==NULL)
6380 return NULL;
6381 res = _PyUnicode_XStrip(self, striptype, sep);
6382 Py_DECREF(sep);
6383 return res;
6384 }
6385 else {
6386 PyErr_Format(PyExc_TypeError,
6387 "%s arg must be None, unicode or str",
6388 STRIPNAME(striptype));
6389 return NULL;
6390 }
6391 }
6392
6393 return do_strip(self, striptype);
6394}
6395
6396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006397PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006398"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006399\n\
6400Return a copy of the string S with leading and trailing\n\
6401whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006402If chars is given and not None, remove characters in chars instead.\n\
6403If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006404
6405static PyObject *
6406unicode_strip(PyUnicodeObject *self, PyObject *args)
6407{
6408 if (PyTuple_GET_SIZE(args) == 0)
6409 return do_strip(self, BOTHSTRIP); /* Common case */
6410 else
6411 return do_argstrip(self, BOTHSTRIP, args);
6412}
6413
6414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006415PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006416"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006417\n\
6418Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006419If chars is given and not None, remove characters in chars instead.\n\
6420If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006421
6422static PyObject *
6423unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6424{
6425 if (PyTuple_GET_SIZE(args) == 0)
6426 return do_strip(self, LEFTSTRIP); /* Common case */
6427 else
6428 return do_argstrip(self, LEFTSTRIP, args);
6429}
6430
6431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006432PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006433"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006434\n\
6435Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006436If chars is given and not None, remove characters in chars instead.\n\
6437If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006438
6439static PyObject *
6440unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6441{
6442 if (PyTuple_GET_SIZE(args) == 0)
6443 return do_strip(self, RIGHTSTRIP); /* Common case */
6444 else
6445 return do_argstrip(self, RIGHTSTRIP, args);
6446}
6447
6448
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006450unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451{
6452 PyUnicodeObject *u;
6453 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006454 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006455 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456
6457 if (len < 0)
6458 len = 0;
6459
Tim Peters7a29bd52001-09-12 03:03:31 +00006460 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 /* no repeat, return original string */
6462 Py_INCREF(str);
6463 return (PyObject*) str;
6464 }
Tim Peters8f422462000-09-09 06:13:41 +00006465
6466 /* ensure # of chars needed doesn't overflow int and # of bytes
6467 * needed doesn't overflow size_t
6468 */
6469 nchars = len * str->length;
6470 if (len && nchars / len != str->length) {
6471 PyErr_SetString(PyExc_OverflowError,
6472 "repeated string is too long");
6473 return NULL;
6474 }
6475 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6476 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6477 PyErr_SetString(PyExc_OverflowError,
6478 "repeated string is too long");
6479 return NULL;
6480 }
6481 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 if (!u)
6483 return NULL;
6484
6485 p = u->str;
6486
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006487 if (str->length == 1 && len > 0) {
6488 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006489 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006490 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006491 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006492 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006493 done = str->length;
6494 }
6495 while (done < nchars) {
6496 int n = (done <= nchars-done) ? done : nchars-done;
6497 Py_UNICODE_COPY(p+done, p, n);
6498 done += n;
6499 }
6500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501
6502 return (PyObject*) u;
6503}
6504
6505PyObject *PyUnicode_Replace(PyObject *obj,
6506 PyObject *subobj,
6507 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006508 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509{
6510 PyObject *self;
6511 PyObject *str1;
6512 PyObject *str2;
6513 PyObject *result;
6514
6515 self = PyUnicode_FromObject(obj);
6516 if (self == NULL)
6517 return NULL;
6518 str1 = PyUnicode_FromObject(subobj);
6519 if (str1 == NULL) {
6520 Py_DECREF(self);
6521 return NULL;
6522 }
6523 str2 = PyUnicode_FromObject(replobj);
6524 if (str2 == NULL) {
6525 Py_DECREF(self);
6526 Py_DECREF(str1);
6527 return NULL;
6528 }
Tim Petersced69f82003-09-16 20:30:58 +00006529 result = replace((PyUnicodeObject *)self,
6530 (PyUnicodeObject *)str1,
6531 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 maxcount);
6533 Py_DECREF(self);
6534 Py_DECREF(str1);
6535 Py_DECREF(str2);
6536 return result;
6537}
6538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006539PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540"S.replace (old, new[, maxsplit]) -> unicode\n\
6541\n\
6542Return a copy of S with all occurrences of substring\n\
6543old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006544given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545
6546static PyObject*
6547unicode_replace(PyUnicodeObject *self, PyObject *args)
6548{
6549 PyUnicodeObject *str1;
6550 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006551 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 PyObject *result;
6553
Martin v. Löwis18e16552006-02-15 17:27:45 +00006554 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 return NULL;
6556 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6557 if (str1 == NULL)
6558 return NULL;
6559 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006560 if (str2 == NULL) {
6561 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564
6565 result = replace(self, str1, str2, maxcount);
6566
6567 Py_DECREF(str1);
6568 Py_DECREF(str2);
6569 return result;
6570}
6571
6572static
6573PyObject *unicode_repr(PyObject *unicode)
6574{
6575 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6576 PyUnicode_GET_SIZE(unicode),
6577 1);
6578}
6579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581"S.rfind(sub [,start [,end]]) -> int\n\
6582\n\
6583Return the highest index in S where substring sub is found,\n\
Georg Brandlb4d100c2007-07-29 17:37:22 +00006584such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585arguments start and end are interpreted as in slice notation.\n\
6586\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006587Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588
6589static PyObject *
6590unicode_rfind(PyUnicodeObject *self, PyObject *args)
6591{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006592 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006593 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006594 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006595 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
Guido van Rossumb8872e62000-05-09 14:14:27 +00006597 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6598 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006600 substring = PyUnicode_FromObject(substring);
6601 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 return NULL;
6603
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006604 result = stringlib_rfind_slice(
6605 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6606 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6607 start, end
6608 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609
6610 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006611
6612 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613}
6614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006615PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616"S.rindex(sub [,start [,end]]) -> int\n\
6617\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006618Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
6620static PyObject *
6621unicode_rindex(PyUnicodeObject *self, PyObject *args)
6622{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006623 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006624 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006625 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006626 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627
Guido van Rossumb8872e62000-05-09 14:14:27 +00006628 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6629 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006631 substring = PyUnicode_FromObject(substring);
6632 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return NULL;
6634
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006635 result = stringlib_rfind_slice(
6636 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6637 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6638 start, end
6639 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640
6641 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 if (result < 0) {
6644 PyErr_SetString(PyExc_ValueError, "substring not found");
6645 return NULL;
6646 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006647 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006650PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006651"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652\n\
6653Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006654done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655
6656static PyObject *
6657unicode_rjust(PyUnicodeObject *self, PyObject *args)
6658{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006659 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006660 Py_UNICODE fillchar = ' ';
6661
Martin v. Löwis412fb672006-04-13 06:34:32 +00006662 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 return NULL;
6664
Tim Peters7a29bd52001-09-12 03:03:31 +00006665 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 Py_INCREF(self);
6667 return (PyObject*) self;
6668 }
6669
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006670 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671}
6672
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675{
6676 /* standard clamping */
6677 if (start < 0)
6678 start = 0;
6679 if (end < 0)
6680 end = 0;
6681 if (end > self->length)
6682 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006683 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 /* full slice, return original string */
6685 Py_INCREF(self);
6686 return (PyObject*) self;
6687 }
6688 if (start > end)
6689 start = end;
6690 /* copy slice */
6691 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6692 end - start);
6693}
6694
6695PyObject *PyUnicode_Split(PyObject *s,
6696 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006697 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698{
6699 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 s = PyUnicode_FromObject(s);
6702 if (s == NULL)
6703 return NULL;
6704 if (sep != NULL) {
6705 sep = PyUnicode_FromObject(sep);
6706 if (sep == NULL) {
6707 Py_DECREF(s);
6708 return NULL;
6709 }
6710 }
6711
6712 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6713
6714 Py_DECREF(s);
6715 Py_XDECREF(sep);
6716 return result;
6717}
6718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720"S.split([sep [,maxsplit]]) -> list of strings\n\
6721\n\
6722Return a list of the words in S, using sep as the\n\
6723delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006724splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006725any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
6727static PyObject*
6728unicode_split(PyUnicodeObject *self, PyObject *args)
6729{
6730 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006731 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
Martin v. Löwis18e16552006-02-15 17:27:45 +00006733 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 return NULL;
6735
6736 if (substring == Py_None)
6737 return split(self, NULL, maxcount);
6738 else if (PyUnicode_Check(substring))
6739 return split(self, (PyUnicodeObject *)substring, maxcount);
6740 else
6741 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6742}
6743
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006744PyObject *
6745PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6746{
6747 PyObject* str_obj;
6748 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006749 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006750
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006751 str_obj = PyUnicode_FromObject(str_in);
6752 if (!str_obj)
6753 return NULL;
6754 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006755 if (!sep_obj) {
6756 Py_DECREF(str_obj);
6757 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006758 }
6759
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006760 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006761 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6762 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6763 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006764
Fredrik Lundhb9479482006-05-26 17:22:38 +00006765 Py_DECREF(sep_obj);
6766 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006767
6768 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006769}
6770
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006771
6772PyObject *
6773PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6774{
6775 PyObject* str_obj;
6776 PyObject* sep_obj;
6777 PyObject* out;
6778
6779 str_obj = PyUnicode_FromObject(str_in);
6780 if (!str_obj)
6781 return NULL;
6782 sep_obj = PyUnicode_FromObject(sep_in);
6783 if (!sep_obj) {
6784 Py_DECREF(str_obj);
6785 return NULL;
6786 }
6787
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006788 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006789 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6790 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6791 );
6792
6793 Py_DECREF(sep_obj);
6794 Py_DECREF(str_obj);
6795
6796 return out;
6797}
6798
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006799PyDoc_STRVAR(partition__doc__,
6800"S.partition(sep) -> (head, sep, tail)\n\
6801\n\
6802Searches for the separator sep in S, and returns the part before it,\n\
6803the separator itself, and the part after it. If the separator is not\n\
6804found, returns S and two empty strings.");
6805
6806static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006807unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006808{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006809 return PyUnicode_Partition((PyObject *)self, separator);
6810}
6811
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006812PyDoc_STRVAR(rpartition__doc__,
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006813"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006814\n\
6815Searches for the separator sep in S, starting at the end of S, and returns\n\
6816the part before it, the separator itself, and the part after it. If the\n\
Neal Norwitz29a5fdb2006-09-05 02:21:38 +00006817separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006818
6819static PyObject*
6820unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6821{
6822 return PyUnicode_RPartition((PyObject *)self, separator);
6823}
6824
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006825PyObject *PyUnicode_RSplit(PyObject *s,
6826 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006827 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006828{
6829 PyObject *result;
6830
6831 s = PyUnicode_FromObject(s);
6832 if (s == NULL)
6833 return NULL;
6834 if (sep != NULL) {
6835 sep = PyUnicode_FromObject(sep);
6836 if (sep == NULL) {
6837 Py_DECREF(s);
6838 return NULL;
6839 }
6840 }
6841
6842 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6843
6844 Py_DECREF(s);
6845 Py_XDECREF(sep);
6846 return result;
6847}
6848
6849PyDoc_STRVAR(rsplit__doc__,
6850"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6851\n\
6852Return a list of the words in S, using sep as the\n\
6853delimiter string, starting at the end of the string and\n\
6854working to the front. If maxsplit is given, at most maxsplit\n\
6855splits are done. If sep is not specified, any whitespace string\n\
6856is a separator.");
6857
6858static PyObject*
6859unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6860{
6861 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006862 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006863
Martin v. Löwis18e16552006-02-15 17:27:45 +00006864 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006865 return NULL;
6866
6867 if (substring == Py_None)
6868 return rsplit(self, NULL, maxcount);
6869 else if (PyUnicode_Check(substring))
6870 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6871 else
6872 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6873}
6874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006875PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006876"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877\n\
6878Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006879Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006880is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
6882static PyObject*
6883unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6884{
Guido van Rossum86662912000-04-11 15:38:46 +00006885 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886
Guido van Rossum86662912000-04-11 15:38:46 +00006887 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 return NULL;
6889
Guido van Rossum86662912000-04-11 15:38:46 +00006890 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891}
6892
6893static
6894PyObject *unicode_str(PyUnicodeObject *self)
6895{
Fred Drakee4315f52000-05-09 19:53:39 +00006896 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897}
6898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006899PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900"S.swapcase() -> unicode\n\
6901\n\
6902Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006903and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904
6905static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006906unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 return fixup(self, fixswapcase);
6909}
6910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006911PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912"S.translate(table) -> unicode\n\
6913\n\
6914Return a copy of the string S, where all characters have been mapped\n\
6915through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006916Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6917Unmapped characters are left untouched. Characters mapped to None\n\
6918are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919
6920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006921unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922{
Tim Petersced69f82003-09-16 20:30:58 +00006923 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006925 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 "ignore");
6927}
6928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006929PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930"S.upper() -> unicode\n\
6931\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006932Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
6934static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006935unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 return fixup(self, fixupper);
6938}
6939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006940PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941"S.zfill(width) -> unicode\n\
6942\n\
6943Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006944of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
6946static PyObject *
6947unicode_zfill(PyUnicodeObject *self, PyObject *args)
6948{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006949 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 PyUnicodeObject *u;
6951
Martin v. Löwis18e16552006-02-15 17:27:45 +00006952 Py_ssize_t width;
6953 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 return NULL;
6955
6956 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006957 if (PyUnicode_CheckExact(self)) {
6958 Py_INCREF(self);
6959 return (PyObject*) self;
6960 }
6961 else
6962 return PyUnicode_FromUnicode(
6963 PyUnicode_AS_UNICODE(self),
6964 PyUnicode_GET_SIZE(self)
6965 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 }
6967
6968 fill = width - self->length;
6969
6970 u = pad(self, fill, 0, '0');
6971
Walter Dörwald068325e2002-04-15 13:36:47 +00006972 if (u == NULL)
6973 return NULL;
6974
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975 if (u->str[fill] == '+' || u->str[fill] == '-') {
6976 /* move sign to beginning of string */
6977 u->str[0] = u->str[fill];
6978 u->str[fill] = '0';
6979 }
6980
6981 return (PyObject*) u;
6982}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
#if 0
/* Compiled out.  Reports the current value of unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006992PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006995Return True if S starts with the specified prefix, False otherwise.\n\
6996With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00006997With optional end, stop comparing S at that position.\n\
6998prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999
7000static PyObject *
7001unicode_startswith(PyUnicodeObject *self,
7002 PyObject *args)
7003{
Georg Brandl24250812006-06-09 18:45:48 +00007004 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007006 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007007 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007008 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009
Georg Brandl24250812006-06-09 18:45:48 +00007010 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007011 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007013 if (PyTuple_Check(subobj)) {
7014 Py_ssize_t i;
7015 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7016 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7017 PyTuple_GET_ITEM(subobj, i));
7018 if (substring == NULL)
7019 return NULL;
7020 result = tailmatch(self, substring, start, end, -1);
7021 Py_DECREF(substring);
7022 if (result) {
7023 Py_RETURN_TRUE;
7024 }
7025 }
7026 /* nothing matched */
7027 Py_RETURN_FALSE;
7028 }
7029 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007031 return NULL;
7032 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007034 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035}
7036
7037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007038PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007039"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007041Return True if S ends with the specified suffix, False otherwise.\n\
7042With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007043With optional end, stop comparing S at that position.\n\
7044suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045
7046static PyObject *
7047unicode_endswith(PyUnicodeObject *self,
7048 PyObject *args)
7049{
Georg Brandl24250812006-06-09 18:45:48 +00007050 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007052 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007053 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007054 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055
Georg Brandl24250812006-06-09 18:45:48 +00007056 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7057 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007059 if (PyTuple_Check(subobj)) {
7060 Py_ssize_t i;
7061 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7062 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7063 PyTuple_GET_ITEM(subobj, i));
7064 if (substring == NULL)
7065 return NULL;
7066 result = tailmatch(self, substring, start, end, +1);
7067 Py_DECREF(substring);
7068 if (result) {
7069 Py_RETURN_TRUE;
7070 }
7071 }
7072 Py_RETURN_FALSE;
7073 }
7074 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077
Georg Brandl24250812006-06-09 18:45:48 +00007078 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007080 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081}
7082
7083
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007084
7085static PyObject *
7086unicode_getnewargs(PyUnicodeObject *v)
7087{
7088 return Py_BuildValue("(u#)", v->str, v->length);
7089}
7090
7091
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092static PyMethodDef unicode_methods[] = {
7093
7094 /* Order is according to common usage: often used methods should
7095 appear first, since lookup is done sequentially. */
7096
Georg Brandlecdc0a92006-03-30 12:19:07 +00007097 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007098 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7099 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007100 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007101 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7102 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7103 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7104 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7105 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7106 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7107 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007108 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007109 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7110 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7111 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007112 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007113 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007114/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7115 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7116 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7117 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007119 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007120 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007121 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007122 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7123 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7124 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7125 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7126 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7127 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7128 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7129 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7130 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7131 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7132 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7133 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7134 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7135 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007136 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007137#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007138 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139#endif
7140
7141#if 0
7142 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007143 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144#endif
7145
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007146 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 {NULL, NULL}
7148};
7149
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007150static PyObject *
7151unicode_mod(PyObject *v, PyObject *w)
7152{
7153 if (!PyUnicode_Check(v)) {
7154 Py_INCREF(Py_NotImplemented);
7155 return Py_NotImplemented;
7156 }
7157 return PyUnicode_Format(v, w);
7158}
7159
7160static PyNumberMethods unicode_as_number = {
7161 0, /*nb_add*/
7162 0, /*nb_subtract*/
7163 0, /*nb_multiply*/
7164 0, /*nb_divide*/
7165 unicode_mod, /*nb_remainder*/
7166};
7167
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007169 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007170 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007171 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7172 (ssizeargfunc) unicode_getitem, /* sq_item */
7173 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 0, /* sq_ass_item */
7175 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007176 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177};
7178
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007179static PyObject*
7180unicode_subscript(PyUnicodeObject* self, PyObject* item)
7181{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007182 if (PyIndex_Check(item)) {
7183 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007184 if (i == -1 && PyErr_Occurred())
7185 return NULL;
7186 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007187 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007188 return unicode_getitem(self, i);
7189 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007190 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007191 Py_UNICODE* source_buf;
7192 Py_UNICODE* result_buf;
7193 PyObject* result;
7194
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007195 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007196 &start, &stop, &step, &slicelength) < 0) {
7197 return NULL;
7198 }
7199
7200 if (slicelength <= 0) {
7201 return PyUnicode_FromUnicode(NULL, 0);
7202 } else {
7203 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitzb3635f92008-03-18 04:17:36 +00007204 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7205 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007206
7207 if (result_buf == NULL)
7208 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007209
7210 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7211 result_buf[i] = source_buf[cur];
7212 }
Tim Petersced69f82003-09-16 20:30:58 +00007213
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007214 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitzb3635f92008-03-18 04:17:36 +00007215 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007216 return result;
7217 }
7218 } else {
7219 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7220 return NULL;
7221 }
7222}
7223
7224static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007225 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007226 (binaryfunc)unicode_subscript, /* mp_subscript */
7227 (objobjargproc)0, /* mp_ass_subscript */
7228};
7229
Martin v. Löwis18e16552006-02-15 17:27:45 +00007230static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007232 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 const void **ptr)
7234{
7235 if (index != 0) {
7236 PyErr_SetString(PyExc_SystemError,
7237 "accessing non-existent unicode segment");
7238 return -1;
7239 }
7240 *ptr = (void *) self->str;
7241 return PyUnicode_GET_DATA_SIZE(self);
7242}
7243
Martin v. Löwis18e16552006-02-15 17:27:45 +00007244static Py_ssize_t
7245unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 const void **ptr)
7247{
7248 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007249 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 return -1;
7251}
7252
7253static int
7254unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256{
7257 if (lenp)
7258 *lenp = PyUnicode_GET_DATA_SIZE(self);
7259 return 1;
7260}
7261
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007262static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007264 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 const void **ptr)
7266{
7267 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007268
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 if (index != 0) {
7270 PyErr_SetString(PyExc_SystemError,
7271 "accessing non-existent unicode segment");
7272 return -1;
7273 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007274 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 if (str == NULL)
7276 return -1;
7277 *ptr = (void *) PyString_AS_STRING(str);
7278 return PyString_GET_SIZE(str);
7279}
7280
7281/* Helpers for PyUnicode_Format() */
7282
7283static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007284getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287 if (argidx < arglen) {
7288 (*p_argidx)++;
7289 if (arglen < 0)
7290 return args;
7291 else
7292 return PyTuple_GetItem(args, argidx);
7293 }
7294 PyErr_SetString(PyExc_TypeError,
7295 "not enough arguments for format string");
7296 return NULL;
7297}
7298
/* Bit flags describing a format specifier; each occupies its own bit so
   they can be OR-ed together.  F_ALT is what formatfloat()/formatint()
   test to emit the '#' alternate form. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
7304
Martin v. Löwis18e16552006-02-15 17:27:45 +00007305static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007306strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007308 register Py_ssize_t i;
7309 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 for (i = len - 1; i >= 0; i--)
7311 buffer[i] = (Py_UNICODE) charbuffer[i];
7312
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 return len;
7314}
7315
Neal Norwitzfc76d632006-01-10 06:03:13 +00007316static int
7317doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7318{
Tim Peters15231542006-02-16 01:08:01 +00007319 Py_ssize_t result;
7320
Neal Norwitzfc76d632006-01-10 06:03:13 +00007321 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007322 result = strtounicode(buffer, (char *)buffer);
7323 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007324}
7325
7326static int
7327longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7328{
Tim Peters15231542006-02-16 01:08:01 +00007329 Py_ssize_t result;
7330
Neal Norwitzfc76d632006-01-10 06:03:13 +00007331 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007332 result = strtounicode(buffer, (char *)buffer);
7333 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007334}
7335
Guido van Rossum078151d2002-08-11 04:24:12 +00007336/* XXX To save some code duplication, formatfloat/long/int could have been
7337 shared with stringobject.c, converting from 8-bit to Unicode after the
7338 formatting is done. */
7339
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340static int
7341formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007342 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 int flags,
7344 int prec,
7345 int type,
7346 PyObject *v)
7347{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007348 /* fmt = '%#.' + `prec` + `type`
7349 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 char fmt[20];
7351 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007352
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 x = PyFloat_AsDouble(v);
7354 if (x == -1.0 && PyErr_Occurred())
7355 return -1;
7356 if (prec < 0)
7357 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7359 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007360 /* Worst case length calc to ensure no buffer overrun:
7361
7362 'g' formats:
7363 fmt = %#.<prec>g
7364 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7365 for any double rep.)
7366 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7367
7368 'f' formats:
7369 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7370 len = 1 + 50 + 1 + prec = 52 + prec
7371
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007372 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007373 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007374
7375 */
Georg Brandlc5db9232007-07-12 08:38:04 +00007376 if (((type == 'g' || type == 'G') &&
7377 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007378 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007379 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007380 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007381 return -1;
7382 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007383 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7384 (flags&F_ALT) ? "#" : "",
7385 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007386 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387}
7388
Tim Peters38fd5b62000-09-21 05:43:11 +00007389static PyObject*
7390formatlong(PyObject *val, int flags, int prec, int type)
7391{
7392 char *buf;
7393 int i, len;
7394 PyObject *str; /* temporary string object. */
7395 PyUnicodeObject *result;
7396
7397 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7398 if (!str)
7399 return NULL;
7400 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007401 if (!result) {
7402 Py_DECREF(str);
7403 return NULL;
7404 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007405 for (i = 0; i < len; i++)
7406 result->str[i] = buf[i];
7407 result->str[len] = 0;
7408 Py_DECREF(str);
7409 return (PyObject*)result;
7410}
7411
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412static int
7413formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007414 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 int flags,
7416 int prec,
7417 int type,
7418 PyObject *v)
7419{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007420 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007421 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7422 * + 1 + 1
7423 * = 24
7424 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007425 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007426 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427 long x;
7428
7429 x = PyInt_AsLong(v);
7430 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007431 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007432 if (x < 0 && type == 'u') {
7433 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007434 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007435 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7436 sign = "-";
7437 else
7438 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007440 prec = 1;
7441
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007442 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7443 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007444 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007445 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007446 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007447 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007448 return -1;
7449 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007450
7451 if ((flags & F_ALT) &&
7452 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007453 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007454 * of issues that cause pain:
7455 * - when 0 is being converted, the C standard leaves off
7456 * the '0x' or '0X', which is inconsistent with other
7457 * %#x/%#X conversions and inconsistent with Python's
7458 * hex() function
7459 * - there are platforms that violate the standard and
7460 * convert 0 with the '0x' or '0X'
7461 * (Metrowerks, Compaq Tru64)
7462 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007463 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007464 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007465 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007466 * We can achieve the desired consistency by inserting our
7467 * own '0x' or '0X' prefix, and substituting %x/%X in place
7468 * of %#x/%#X.
7469 *
7470 * Note that this is the same approach as used in
7471 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007472 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007473 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7474 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007475 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007476 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007477 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7478 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007479 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007480 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007481 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007482 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007483 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007484 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485}
7486
7487static int
7488formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007489 size_t buflen,
7490 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007492 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007493 if (PyUnicode_Check(v)) {
7494 if (PyUnicode_GET_SIZE(v) != 1)
7495 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007499 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007500 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007501 goto onError;
7502 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
7505 else {
7506 /* Integer input truncated to a character */
7507 long x;
7508 x = PyInt_AsLong(v);
7509 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007510 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007511#ifdef Py_UNICODE_WIDE
7512 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007513 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007514 "%c arg not in range(0x110000) "
7515 "(wide Python build)");
7516 return -1;
7517 }
7518#else
7519 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007520 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007521 "%c arg not in range(0x10000) "
7522 "(narrow Python build)");
7523 return -1;
7524 }
7525#endif
7526 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 }
7528 buf[1] = '\0';
7529 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007530
7531 onError:
7532 PyErr_SetString(PyExc_TypeError,
7533 "%c requires int or char");
7534 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535}
7536
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007537/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7538
7539 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7540 chars are formatted. XXX This is a magic number. Each formatting
7541 routine does bounds checking to ensure no overflow, but a better
7542 solution may be to malloc a buffer of appropriate size for each
7543 format. For now, the current solution is sufficient.
7544*/
7545#define FORMATBUFLEN (size_t)120
7546
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547PyObject *PyUnicode_Format(PyObject *format,
7548 PyObject *args)
7549{
7550 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007551 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 int args_owned = 0;
7553 PyUnicodeObject *result = NULL;
7554 PyObject *dict = NULL;
7555 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007556
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557 if (format == NULL || args == NULL) {
7558 PyErr_BadInternalCall();
7559 return NULL;
7560 }
7561 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007562 if (uformat == NULL)
7563 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 fmt = PyUnicode_AS_UNICODE(uformat);
7565 fmtcnt = PyUnicode_GET_SIZE(uformat);
7566
7567 reslen = rescnt = fmtcnt + 100;
7568 result = _PyUnicode_New(reslen);
7569 if (result == NULL)
7570 goto onError;
7571 res = PyUnicode_AS_UNICODE(result);
7572
7573 if (PyTuple_Check(args)) {
7574 arglen = PyTuple_Size(args);
7575 argidx = 0;
7576 }
7577 else {
7578 arglen = -1;
7579 argidx = -2;
7580 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007581 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7582 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 dict = args;
7584
7585 while (--fmtcnt >= 0) {
7586 if (*fmt != '%') {
7587 if (--rescnt < 0) {
7588 rescnt = fmtcnt + 100;
7589 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007590 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007591 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7593 --rescnt;
7594 }
7595 *res++ = *fmt++;
7596 }
7597 else {
7598 /* Got a format specifier */
7599 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007600 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 Py_UNICODE c = '\0';
7603 Py_UNICODE fill;
7604 PyObject *v = NULL;
7605 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007606 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007608 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007609 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
7611 fmt++;
7612 if (*fmt == '(') {
7613 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007614 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 PyObject *key;
7616 int pcount = 1;
7617
7618 if (dict == NULL) {
7619 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007620 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621 goto onError;
7622 }
7623 ++fmt;
7624 --fmtcnt;
7625 keystart = fmt;
7626 /* Skip over balanced parentheses */
7627 while (pcount > 0 && --fmtcnt >= 0) {
7628 if (*fmt == ')')
7629 --pcount;
7630 else if (*fmt == '(')
7631 ++pcount;
7632 fmt++;
7633 }
7634 keylen = fmt - keystart - 1;
7635 if (fmtcnt < 0 || pcount > 0) {
7636 PyErr_SetString(PyExc_ValueError,
7637 "incomplete format key");
7638 goto onError;
7639 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007640#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007641 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 then looked up since Python uses strings to hold
7643 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007644 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 key = PyUnicode_EncodeUTF8(keystart,
7646 keylen,
7647 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007648#else
7649 key = PyUnicode_FromUnicode(keystart, keylen);
7650#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651 if (key == NULL)
7652 goto onError;
7653 if (args_owned) {
7654 Py_DECREF(args);
7655 args_owned = 0;
7656 }
7657 args = PyObject_GetItem(dict, key);
7658 Py_DECREF(key);
7659 if (args == NULL) {
7660 goto onError;
7661 }
7662 args_owned = 1;
7663 arglen = -1;
7664 argidx = -2;
7665 }
7666 while (--fmtcnt >= 0) {
7667 switch (c = *fmt++) {
7668 case '-': flags |= F_LJUST; continue;
7669 case '+': flags |= F_SIGN; continue;
7670 case ' ': flags |= F_BLANK; continue;
7671 case '#': flags |= F_ALT; continue;
7672 case '0': flags |= F_ZERO; continue;
7673 }
7674 break;
7675 }
7676 if (c == '*') {
7677 v = getnextarg(args, arglen, &argidx);
7678 if (v == NULL)
7679 goto onError;
7680 if (!PyInt_Check(v)) {
7681 PyErr_SetString(PyExc_TypeError,
7682 "* wants int");
7683 goto onError;
7684 }
7685 width = PyInt_AsLong(v);
7686 if (width < 0) {
7687 flags |= F_LJUST;
7688 width = -width;
7689 }
7690 if (--fmtcnt >= 0)
7691 c = *fmt++;
7692 }
7693 else if (c >= '0' && c <= '9') {
7694 width = c - '0';
7695 while (--fmtcnt >= 0) {
7696 c = *fmt++;
7697 if (c < '0' || c > '9')
7698 break;
7699 if ((width*10) / 10 != width) {
7700 PyErr_SetString(PyExc_ValueError,
7701 "width too big");
7702 goto onError;
7703 }
7704 width = width*10 + (c - '0');
7705 }
7706 }
7707 if (c == '.') {
7708 prec = 0;
7709 if (--fmtcnt >= 0)
7710 c = *fmt++;
7711 if (c == '*') {
7712 v = getnextarg(args, arglen, &argidx);
7713 if (v == NULL)
7714 goto onError;
7715 if (!PyInt_Check(v)) {
7716 PyErr_SetString(PyExc_TypeError,
7717 "* wants int");
7718 goto onError;
7719 }
7720 prec = PyInt_AsLong(v);
7721 if (prec < 0)
7722 prec = 0;
7723 if (--fmtcnt >= 0)
7724 c = *fmt++;
7725 }
7726 else if (c >= '0' && c <= '9') {
7727 prec = c - '0';
7728 while (--fmtcnt >= 0) {
7729 c = Py_CHARMASK(*fmt++);
7730 if (c < '0' || c > '9')
7731 break;
7732 if ((prec*10) / 10 != prec) {
7733 PyErr_SetString(PyExc_ValueError,
7734 "prec too big");
7735 goto onError;
7736 }
7737 prec = prec*10 + (c - '0');
7738 }
7739 }
7740 } /* prec */
7741 if (fmtcnt >= 0) {
7742 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 if (--fmtcnt >= 0)
7744 c = *fmt++;
7745 }
7746 }
7747 if (fmtcnt < 0) {
7748 PyErr_SetString(PyExc_ValueError,
7749 "incomplete format");
7750 goto onError;
7751 }
7752 if (c != '%') {
7753 v = getnextarg(args, arglen, &argidx);
7754 if (v == NULL)
7755 goto onError;
7756 }
7757 sign = 0;
7758 fill = ' ';
7759 switch (c) {
7760
7761 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007762 pbuf = formatbuf;
7763 /* presume that buffer length is at least 1 */
7764 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 len = 1;
7766 break;
7767
7768 case 's':
7769 case 'r':
7770 if (PyUnicode_Check(v) && c == 's') {
7771 temp = v;
7772 Py_INCREF(temp);
7773 }
7774 else {
7775 PyObject *unicode;
7776 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007777 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 else
7779 temp = PyObject_Repr(v);
7780 if (temp == NULL)
7781 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007782 if (PyUnicode_Check(temp))
7783 /* nothing to do */;
7784 else if (PyString_Check(temp)) {
7785 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007786 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007788 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007790 Py_DECREF(temp);
7791 temp = unicode;
7792 if (temp == NULL)
7793 goto onError;
7794 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007795 else {
7796 Py_DECREF(temp);
7797 PyErr_SetString(PyExc_TypeError,
7798 "%s argument has non-string str()");
7799 goto onError;
7800 }
7801 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007802 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803 len = PyUnicode_GET_SIZE(temp);
7804 if (prec >= 0 && len > prec)
7805 len = prec;
7806 break;
7807
7808 case 'i':
7809 case 'd':
7810 case 'u':
7811 case 'o':
7812 case 'x':
7813 case 'X':
7814 if (c == 'i')
7815 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007816 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007817 temp = formatlong(v, flags, prec, c);
7818 if (!temp)
7819 goto onError;
7820 pbuf = PyUnicode_AS_UNICODE(temp);
7821 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007822 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007824 else {
7825 pbuf = formatbuf;
7826 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7827 flags, prec, c, v);
7828 if (len < 0)
7829 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007830 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007831 }
7832 if (flags & F_ZERO)
7833 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 break;
7835
7836 case 'e':
7837 case 'E':
7838 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007839 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 case 'g':
7841 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007842 if (c == 'F')
7843 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007844 pbuf = formatbuf;
7845 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7846 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 if (len < 0)
7848 goto onError;
7849 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007850 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 fill = '0';
7852 break;
7853
7854 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007855 pbuf = formatbuf;
7856 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 if (len < 0)
7858 goto onError;
7859 break;
7860
7861 default:
7862 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007863 "unsupported format character '%c' (0x%x) "
Armin Rigo4b63c212006-10-04 11:44:06 +00007864 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00007865 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007866 (int)c,
Armin Rigo4b63c212006-10-04 11:44:06 +00007867 (Py_ssize_t)(fmt - 1 -
7868 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 goto onError;
7870 }
7871 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007872 if (*pbuf == '-' || *pbuf == '+') {
7873 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 len--;
7875 }
7876 else if (flags & F_SIGN)
7877 sign = '+';
7878 else if (flags & F_BLANK)
7879 sign = ' ';
7880 else
7881 sign = 0;
7882 }
7883 if (width < len)
7884 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007885 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 reslen -= rescnt;
7887 rescnt = width + fmtcnt + 100;
7888 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007889 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007890 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007891 PyErr_NoMemory();
7892 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007893 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007894 if (_PyUnicode_Resize(&result, reslen) < 0) {
7895 Py_XDECREF(temp);
7896 goto onError;
7897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 res = PyUnicode_AS_UNICODE(result)
7899 + reslen - rescnt;
7900 }
7901 if (sign) {
7902 if (fill != ' ')
7903 *res++ = sign;
7904 rescnt--;
7905 if (width > len)
7906 width--;
7907 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007908 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7909 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007910 assert(pbuf[1] == c);
7911 if (fill != ' ') {
7912 *res++ = *pbuf++;
7913 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007914 }
Tim Petersfff53252001-04-12 18:38:48 +00007915 rescnt -= 2;
7916 width -= 2;
7917 if (width < 0)
7918 width = 0;
7919 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921 if (width > len && !(flags & F_LJUST)) {
7922 do {
7923 --rescnt;
7924 *res++ = fill;
7925 } while (--width > len);
7926 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007927 if (fill == ' ') {
7928 if (sign)
7929 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007930 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007931 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007932 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007933 *res++ = *pbuf++;
7934 *res++ = *pbuf++;
7935 }
7936 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007937 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938 res += len;
7939 rescnt -= len;
7940 while (--width >= len) {
7941 --rescnt;
7942 *res++ = ' ';
7943 }
7944 if (dict && (argidx < arglen) && c != '%') {
7945 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007946 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007947 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948 goto onError;
7949 }
7950 Py_XDECREF(temp);
7951 } /* '%' */
7952 } /* until end */
7953 if (argidx < arglen && !dict) {
7954 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007955 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 goto onError;
7957 }
7958
Thomas Woutersa96affe2006-03-12 00:29:36 +00007959 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7960 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 if (args_owned) {
7962 Py_DECREF(args);
7963 }
7964 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 return (PyObject *)result;
7966
7967 onError:
7968 Py_XDECREF(result);
7969 Py_DECREF(uformat);
7970 if (args_owned) {
7971 Py_DECREF(args);
7972 }
7973 return NULL;
7974}
7975
/* Buffer-protocol slot table for the unicode type; referenced from
   PyUnicode_Type's tp_as_buffer slot. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
7982
Jeremy Hylton938ace62002-07-17 16:30:39 +00007983static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007984unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7985
Tim Peters6d6c1a32001-08-02 04:15:00 +00007986static PyObject *
7987unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7988{
7989 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007990 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007991 char *encoding = NULL;
7992 char *errors = NULL;
7993
Guido van Rossume023fe02001-08-30 03:12:59 +00007994 if (type != &PyUnicode_Type)
7995 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007996 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7997 kwlist, &x, &encoding, &errors))
7998 return NULL;
7999 if (x == NULL)
8000 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008001 if (encoding == NULL && errors == NULL)
8002 return PyObject_Unicode(x);
8003 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008004 return PyUnicode_FromEncodedObject(x, encoding, errors);
8005}
8006
Guido van Rossume023fe02001-08-30 03:12:59 +00008007static PyObject *
8008unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8009{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008010 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008011 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008012
8013 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8014 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8015 if (tmp == NULL)
8016 return NULL;
8017 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008018 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008019 if (pnew == NULL) {
8020 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008021 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008022 }
Neal Norwitzb3635f92008-03-18 04:17:36 +00008023 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008024 if (pnew->str == NULL) {
8025 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008026 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008027 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008028 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008029 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008030 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8031 pnew->length = n;
8032 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008033 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008034 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008035}
8036
/* Docstring of the unicode type (installed as tp_doc in PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008043
/* Type object for `unicode`.  It derives from basestring (tp_base), may
   itself be subclassed (Py_TPFLAGS_BASETYPE), and is instantiated through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8088
8089/* Initialize the Unicode implementation */
8090
Thomas Wouters78890102000-07-22 19:25:51 +00008091void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008093 int i;
8094
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008095 /* XXX - move this array to unicodectype.c ? */
8096 Py_UNICODE linebreak[] = {
8097 0x000A, /* LINE FEED */
8098 0x000D, /* CARRIAGE RETURN */
8099 0x001C, /* FILE SEPARATOR */
8100 0x001D, /* GROUP SEPARATOR */
8101 0x001E, /* RECORD SEPARATOR */
8102 0x0085, /* NEXT LINE */
8103 0x2028, /* LINE SEPARATOR */
8104 0x2029, /* PARAGRAPH SEPARATOR */
8105 };
8106
Fred Drakee4315f52000-05-09 19:53:39 +00008107 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008108 unicode_freelist = NULL;
8109 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008111 if (!unicode_empty)
8112 return;
8113
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008114 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008115 for (i = 0; i < 256; i++)
8116 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008117 if (PyType_Ready(&PyUnicode_Type) < 0)
8118 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008119
8120 /* initialize the linebreak bloom filter */
8121 bloom_linebreak = make_bloom_mask(
8122 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8123 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008124
8125 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126}
8127
8128/* Finalize the Unicode implementation */
8129
8130void
Thomas Wouters78890102000-07-22 19:25:51 +00008131_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008133 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008134 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008136 Py_XDECREF(unicode_empty);
8137 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008138
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008139 for (i = 0; i < 256; i++) {
8140 if (unicode_latin1[i]) {
8141 Py_DECREF(unicode_latin1[i]);
8142 unicode_latin1[i] = NULL;
8143 }
8144 }
8145
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008146 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 PyUnicodeObject *v = u;
8148 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008149 if (v->str)
Neal Norwitzb3635f92008-03-18 04:17:36 +00008150 PyObject_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008151 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008152 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008154 unicode_freelist = NULL;
8155 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008157
Anthony Baxterac6bd462006-04-13 02:06:09 +00008158#ifdef __cplusplus
8159}
8160#endif
8161
8162
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008163/*
8164Local variables:
8165c-basic-offset: 4
8166indent-tabs-mode: nil
8167End:
8168*/