blob: 74b47964275443729d0b04af56184943b68b44a4 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

#define PY_SSIZE_T_CLEAN
#include "Python.h"

#include "unicodeobject.h"
#include "ucnhash.h"

#ifdef MS_WINDOWS
#include <windows.h>
#endif

/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit code point: an entry is 1 when the
   character is one of the whitespace characters named in the comments
   below and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * HORIZONTAL TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * VERTICAL TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

/* Same for linebreaks.

   Lookup table indexed by a 7-bit code point: an entry is 1 when the
   character is one of the line break characters named in the comments
   below and 0 otherwise.  The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};


Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177 return 0x10FFFF;
178#else
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
182#endif
183}
184
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Test the bit selected by the low five bits of ch in mask.  The shift
   is done on 1UL because the bit index can be 31, and shifting a plain
   int 1 into the sign bit is undefined behaviour.  Both arguments are
   parenthesized so that expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: characters below 128 are looked up in
   ascii_linebreak; for the rest the bloom mask is consulted before
   Py_UNICODE_ISLINEBREAK() is evaluated. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))

Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* Membership test: the cheap bloom check runs first and the linear
   search only when the bloom bit is set.  The expansion is wrapped in
   parentheses so that it stays a single operand, e.g. under '!'. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))

Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000247 if (unicode == unicode_empty ||
248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
262 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
263 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000264 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 PyErr_NoMemory();
266 return -1;
267 }
268 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000269 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000271 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000273 if (unicode->defenc) {
274 Py_DECREF(unicode->defenc);
275 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 }
277 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000278
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return 0;
280}
281
282/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000283 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
285 XXX This allocator could further be enhanced by assuring that the
286 free list never reduces its size below 1.
287
288*/
289
290static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000291PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292{
293 register PyUnicodeObject *unicode;
294
Andrew Dalkee0df7622006-05-27 11:04:36 +0000295 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 if (length == 0 && unicode_empty != NULL) {
297 Py_INCREF(unicode_empty);
298 return unicode_empty;
299 }
300
301 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000302 if (free_list) {
303 unicode = free_list;
304 free_list = *(PyUnicodeObject **)unicode;
305 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000307 /* Keep-Alive optimization: we only upsize the buffer,
308 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000309 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000310 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000311 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000312 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313 }
314 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000315 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000317 }
318 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 }
320 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000321 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (unicode == NULL)
323 return NULL;
324 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
325 }
326
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000327 if (!unicode->str) {
328 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000329 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000330 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000331 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000332 * the caller fails before initializing str -- unicode_resize()
333 * reads str[0], and the Keep-Alive optimization can keep memory
334 * allocated for str alive across a call to unicode_dealloc(unicode).
335 * We don't want unicode_resize to read uninitialized memory in
336 * that case.
337 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000338 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000340 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000342 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000344
345 onError:
346 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000347 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349}
350
351static
Guido van Rossum9475a232001-10-05 20:51:39 +0000352void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000354 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes5b970ad2008-02-06 13:33:44 +0000355 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000356 /* Keep-Alive optimization */
357 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000358 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 unicode->str = NULL;
360 unicode->length = 0;
361 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000362 if (unicode->defenc) {
363 Py_DECREF(unicode->defenc);
364 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000365 }
366 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000367 *(PyUnicodeObject **)unicode = free_list;
368 free_list = unicode;
369 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000372 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000373 Py_XDECREF(unicode->defenc);
Christian Heimese93237d2007-12-19 02:37:44 +0000374 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 }
376}
377
Martin v. Löwis18e16552006-02-15 17:27:45 +0000378int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379{
380 register PyUnicodeObject *v;
381
382 /* Argument checks */
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 v = (PyUnicodeObject *)*unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000388 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 PyErr_BadInternalCall();
390 return -1;
391 }
392
393 /* Resizing unicode_empty and single character objects is not
394 possible since these are being shared. We simply return a fresh
395 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000396 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000397 (v == unicode_empty || v->length == 1)) {
398 PyUnicodeObject *w = _PyUnicode_New(length);
399 if (w == NULL)
400 return -1;
401 Py_UNICODE_COPY(w->str, v->str,
402 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000403 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000404 *unicode = (PyObject *)w;
405 return 0;
406 }
407
408 /* Note that we don't have to modify *unicode for unshared Unicode
409 objects, since we can modify them in-place. */
410 return unicode_resize(v, length);
411}
412
413/* Internal API for use in unicodeobject.c only ! */
414#define _PyUnicode_Resize(unicodevar, length) \
415 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
416
Guido van Rossumd57fd912000-03-10 22:53:23 +0000417PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000418 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419{
420 PyUnicodeObject *unicode;
421
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000422 /* If the Unicode data is known at construction time, we can apply
423 some optimizations which share commonly used objects. */
424 if (u != NULL) {
425
426 /* Optimization for empty strings */
427 if (size == 0 && unicode_empty != NULL) {
428 Py_INCREF(unicode_empty);
429 return (PyObject *)unicode_empty;
430 }
431
432 /* Single character Unicode objects in the Latin-1 range are
433 shared when using this constructor */
434 if (size == 1 && *u < 256) {
435 unicode = unicode_latin1[*u];
436 if (!unicode) {
437 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000438 if (!unicode)
439 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000440 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441 unicode_latin1[*u] = unicode;
442 }
443 Py_INCREF(unicode);
444 return (PyObject *)unicode;
445 }
446 }
Tim Petersced69f82003-09-16 20:30:58 +0000447
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 unicode = _PyUnicode_New(size);
449 if (!unicode)
450 return NULL;
451
452 /* Copy the Unicode data into the new object */
453 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000454 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455
456 return (PyObject *)unicode;
457}
458
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000459PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
460{
461 PyUnicodeObject *unicode;
462 /* If the Unicode data is known at construction time, we can apply
463 some optimizations which share commonly used objects.
464 Also, this means the input must be UTF-8, so fall back to the
465 UTF-8 decoder at the end. */
466 if (u != NULL) {
467
468 /* Optimization for empty strings */
469 if (size == 0 && unicode_empty != NULL) {
470 Py_INCREF(unicode_empty);
471 return (PyObject *)unicode_empty;
472 }
473
474 /* Single characters are shared when using this constructor.
475 Restrict to ASCII, since the input must be UTF-8. */
476 if (size == 1 && Py_CHARMASK(*u) < 128) {
477 unicode = unicode_latin1[Py_CHARMASK(*u)];
478 if (!unicode) {
479 unicode = _PyUnicode_New(1);
480 if (!unicode)
481 return NULL;
482 unicode->str[0] = Py_CHARMASK(*u);
483 unicode_latin1[Py_CHARMASK(*u)] = unicode;
484 }
485 Py_INCREF(unicode);
486 return (PyObject *)unicode;
487 }
488
489 return PyUnicode_DecodeUTF8(u, size, NULL);
490 }
491
492 unicode = _PyUnicode_New(size);
493 if (!unicode)
494 return NULL;
495
496 return (PyObject *)unicode;
497}
498
499PyObject *PyUnicode_FromString(const char *u)
500{
501 size_t size = strlen(u);
502 if (size > PY_SSIZE_T_MAX) {
503 PyErr_SetString(PyExc_OverflowError, "input too long");
504 return NULL;
505 }
506
507 return PyUnicode_FromStringAndSize(u, size);
508}
509
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510#ifdef HAVE_WCHAR_H
511
512PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000513 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514{
515 PyUnicodeObject *unicode;
516
517 if (w == NULL) {
518 PyErr_BadInternalCall();
519 return NULL;
520 }
521
522 unicode = _PyUnicode_New(size);
523 if (!unicode)
524 return NULL;
525
526 /* Copy the wchar_t data into the new object */
527#ifdef HAVE_USABLE_WCHAR_T
528 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000529#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530 {
531 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000532 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000533 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000534 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535 *u++ = *w++;
536 }
537#endif
538
539 return (PyObject *)unicode;
540}
541
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000542static void
543makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
544{
545 *fmt++ = '%';
546 if (width) {
547 if (zeropad)
548 *fmt++ = '0';
549 fmt += sprintf(fmt, "%d", width);
550 }
551 if (precision)
552 fmt += sprintf(fmt, ".%d", precision);
553 if (longflag)
554 *fmt++ = 'l';
555 else if (size_tflag) {
556 char *f = PY_FORMAT_SIZE_T;
557 while (*f)
558 *fmt++ = *f++;
559 }
560 *fmt++ = c;
561 *fmt = '\0';
562}
563
564#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
565
566PyObject *
567PyUnicode_FromFormatV(const char *format, va_list vargs)
568{
569 va_list count;
570 Py_ssize_t callcount = 0;
571 PyObject **callresults = NULL;
572 PyObject **callresult = NULL;
573 Py_ssize_t n = 0;
574 int width = 0;
575 int precision = 0;
576 int zeropad;
577 const char* f;
578 Py_UNICODE *s;
579 PyObject *string;
580 /* used by sprintf */
581 char buffer[21];
582 /* use abuffer instead of buffer, if we need more space
583 * (which can happen if there's a format specifier with width). */
584 char *abuffer = NULL;
585 char *realbuffer;
586 Py_ssize_t abuffersize = 0;
587 char fmt[60]; /* should be enough for %0width.precisionld */
588 const char *copy;
589
590#ifdef VA_LIST_IS_ARRAY
591 Py_MEMCPY(count, vargs, sizeof(va_list));
592#else
593#ifdef __va_copy
594 __va_copy(count, vargs);
595#else
596 count = vargs;
597#endif
598#endif
599 /* step 1: count the number of %S/%R format specifications
600 * (we call PyObject_Str()/PyObject_Repr() for these objects
601 * once during step 3 and put the result in an array) */
602 for (f = format; *f; f++) {
603 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
604 ++callcount;
605 }
606 /* step 2: allocate memory for the results of
607 * PyObject_Str()/PyObject_Repr() calls */
608 if (callcount) {
609 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
610 if (!callresults) {
611 PyErr_NoMemory();
612 return NULL;
613 }
614 callresult = callresults;
615 }
616 /* step 3: figure out how large a buffer we need */
617 for (f = format; *f; f++) {
618 if (*f == '%') {
619 const char* p = f;
620 width = 0;
621 while (isdigit(*f))
622 width = (width*10) + *f++ - '0';
623 while (*++f && *f != '%' && !isalpha(*f))
624 ;
625
626 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
627 * they don't affect the amount of space we reserve.
628 */
629 if ((*f == 'l' || *f == 'z') &&
630 (f[1] == 'd' || f[1] == 'u'))
631 ++f;
632
633 switch (*f) {
634 case 'c':
635 (void)va_arg(count, int);
636 /* fall through... */
637 case '%':
638 n++;
639 break;
640 case 'd': case 'u': case 'i': case 'x':
641 (void) va_arg(count, int);
642 /* 20 bytes is enough to hold a 64-bit
643 integer. Decimal takes the most space.
644 This isn't enough for octal.
645 If a width is specified we need more
646 (which we allocate later). */
647 if (width < 20)
648 width = 20;
649 n += width;
650 if (abuffersize < width)
651 abuffersize = width;
652 break;
653 case 's':
654 {
655 /* UTF-8 */
656 unsigned char*s;
657 s = va_arg(count, unsigned char*);
658 while (*s) {
659 if (*s < 128) {
660 n++; s++;
661 } else if (*s < 0xc0) {
662 /* invalid UTF-8 */
663 n++; s++;
664 } else if (*s < 0xc0) {
665 n++;
666 s++; if(!*s)break;
667 s++;
668 } else if (*s < 0xe0) {
669 n++;
670 s++; if(!*s)break;
671 s++; if(!*s)break;
672 s++;
673 } else {
674 #ifdef Py_UNICODE_WIDE
675 n++;
676 #else
677 n+=2;
678 #endif
679 s++; if(!*s)break;
680 s++; if(!*s)break;
681 s++; if(!*s)break;
682 s++;
683 }
684 }
685 break;
686 }
687 case 'U':
688 {
689 PyObject *obj = va_arg(count, PyObject *);
690 assert(obj && PyUnicode_Check(obj));
691 n += PyUnicode_GET_SIZE(obj);
692 break;
693 }
694 case 'V':
695 {
696 PyObject *obj = va_arg(count, PyObject *);
697 const char *str = va_arg(count, const char *);
698 assert(obj || str);
699 assert(!obj || PyUnicode_Check(obj));
700 if (obj)
701 n += PyUnicode_GET_SIZE(obj);
702 else
703 n += strlen(str);
704 break;
705 }
706 case 'S':
707 {
708 PyObject *obj = va_arg(count, PyObject *);
709 PyObject *str;
710 assert(obj);
711 str = PyObject_Str(obj);
712 if (!str)
713 goto fail;
714 n += PyUnicode_GET_SIZE(str);
715 /* Remember the str and switch to the next slot */
716 *callresult++ = str;
717 break;
718 }
719 case 'R':
720 {
721 PyObject *obj = va_arg(count, PyObject *);
722 PyObject *repr;
723 assert(obj);
724 repr = PyObject_Repr(obj);
725 if (!repr)
726 goto fail;
727 n += PyUnicode_GET_SIZE(repr);
728 /* Remember the repr and switch to the next slot */
729 *callresult++ = repr;
730 break;
731 }
732 case 'p':
733 (void) va_arg(count, int);
734 /* maximum 64-bit pointer representation:
735 * 0xffffffffffffffff
736 * so 19 characters is enough.
737 * XXX I count 18 -- what's the extra for?
738 */
739 n += 19;
740 break;
741 default:
742 /* if we stumble upon an unknown
743 formatting code, copy the rest of
744 the format string to the output
745 string. (we cannot just skip the
746 code, since there's no way to know
747 what's in the argument list) */
748 n += strlen(p);
749 goto expand;
750 }
751 } else
752 n++;
753 }
754 expand:
755 if (abuffersize > 20) {
756 abuffer = PyMem_Malloc(abuffersize);
757 if (!abuffer) {
758 PyErr_NoMemory();
759 goto fail;
760 }
761 realbuffer = abuffer;
762 }
763 else
764 realbuffer = buffer;
765 /* step 4: fill the buffer */
766 /* Since we've analyzed how much space we need for the worst case,
767 we don't have to resize the string.
768 There can be no errors beyond this point. */
769 string = PyUnicode_FromUnicode(NULL, n);
770 if (!string)
771 goto fail;
772
773 s = PyUnicode_AS_UNICODE(string);
774 callresult = callresults;
775
776 for (f = format; *f; f++) {
777 if (*f == '%') {
778 const char* p = f++;
779 int longflag = 0;
780 int size_tflag = 0;
781 zeropad = (*f == '0');
782 /* parse the width.precision part */
783 width = 0;
784 while (isdigit(*f))
785 width = (width*10) + *f++ - '0';
786 precision = 0;
787 if (*f == '.') {
788 f++;
789 while (isdigit(*f))
790 precision = (precision*10) + *f++ - '0';
791 }
792 /* handle the long flag, but only for %ld and %lu.
793 others can be added when necessary. */
794 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
795 longflag = 1;
796 ++f;
797 }
798 /* handle the size_t flag. */
799 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
800 size_tflag = 1;
801 ++f;
802 }
803
804 switch (*f) {
805 case 'c':
806 *s++ = va_arg(vargs, int);
807 break;
808 case 'd':
809 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
810 if (longflag)
811 sprintf(realbuffer, fmt, va_arg(vargs, long));
812 else if (size_tflag)
813 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
814 else
815 sprintf(realbuffer, fmt, va_arg(vargs, int));
816 appendstring(realbuffer);
817 break;
818 case 'u':
819 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
820 if (longflag)
821 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
822 else if (size_tflag)
823 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
824 else
825 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
826 appendstring(realbuffer);
827 break;
828 case 'i':
829 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
830 sprintf(realbuffer, fmt, va_arg(vargs, int));
831 appendstring(realbuffer);
832 break;
833 case 'x':
834 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
835 sprintf(realbuffer, fmt, va_arg(vargs, int));
836 appendstring(realbuffer);
837 break;
838 case 's':
839 {
840 /* Parameter must be UTF-8 encoded.
841 In case of encoding errors, use
842 the replacement character. */
843 PyObject *u;
844 p = va_arg(vargs, char*);
845 u = PyUnicode_DecodeUTF8(p, strlen(p),
846 "replace");
847 if (!u)
848 goto fail;
849 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
850 PyUnicode_GET_SIZE(u));
851 s += PyUnicode_GET_SIZE(u);
852 Py_DECREF(u);
853 break;
854 }
855 case 'U':
856 {
857 PyObject *obj = va_arg(vargs, PyObject *);
858 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
859 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
860 s += size;
861 break;
862 }
863 case 'V':
864 {
865 PyObject *obj = va_arg(vargs, PyObject *);
866 const char *str = va_arg(vargs, const char *);
867 if (obj) {
868 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
869 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
870 s += size;
871 } else {
872 appendstring(str);
873 }
874 break;
875 }
876 case 'S':
877 case 'R':
878 {
879 Py_UNICODE *ucopy;
880 Py_ssize_t usize;
881 Py_ssize_t upos;
882 /* unused, since we already have the result */
883 (void) va_arg(vargs, PyObject *);
884 ucopy = PyUnicode_AS_UNICODE(*callresult);
885 usize = PyUnicode_GET_SIZE(*callresult);
886 for (upos = 0; upos<usize;)
887 *s++ = ucopy[upos++];
888 /* We're done with the unicode()/repr() => forget it */
889 Py_DECREF(*callresult);
890 /* switch to next unicode()/repr() result */
891 ++callresult;
892 break;
893 }
894 case 'p':
895 sprintf(buffer, "%p", va_arg(vargs, void*));
896 /* %p is ill-defined: ensure leading 0x. */
897 if (buffer[1] == 'X')
898 buffer[1] = 'x';
899 else if (buffer[1] != 'x') {
900 memmove(buffer+2, buffer, strlen(buffer)+1);
901 buffer[0] = '0';
902 buffer[1] = 'x';
903 }
904 appendstring(buffer);
905 break;
906 case '%':
907 *s++ = '%';
908 break;
909 default:
910 appendstring(p);
911 goto end;
912 }
913 } else
914 *s++ = *f;
915 }
916
917 end:
918 if (callresults)
919 PyMem_Free(callresults);
920 if (abuffer)
921 PyMem_Free(abuffer);
922 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
923 return string;
924 fail:
925 if (callresults) {
926 PyObject **callresult2 = callresults;
927 while (callresult2 < callresult) {
928 Py_DECREF(*callresult2);
929 ++callresult2;
930 }
931 PyMem_Free(callresults);
932 }
933 if (abuffer)
934 PyMem_Free(abuffer);
935 return NULL;
936}
937
938#undef appendstring
939
940PyObject *
941PyUnicode_FromFormat(const char *format, ...)
942{
943 PyObject* ret;
944 va_list vargs;
945
946#ifdef HAVE_STDARG_PROTOTYPES
947 va_start(vargs, format);
948#else
949 va_start(vargs);
950#endif
951 ret = PyUnicode_FromFormatV(format, vargs);
952 va_end(vargs);
953 return ret;
954}
955
Martin v. Löwis18e16552006-02-15 17:27:45 +0000956Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
957 wchar_t *w,
958 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000959{
960 if (unicode == NULL) {
961 PyErr_BadInternalCall();
962 return -1;
963 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000964
965 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000966 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000967 size = PyUnicode_GET_SIZE(unicode) + 1;
968
Guido van Rossumd57fd912000-03-10 22:53:23 +0000969#ifdef HAVE_USABLE_WCHAR_T
970 memcpy(w, unicode->str, size * sizeof(wchar_t));
971#else
972 {
973 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000974 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000976 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000977 *w++ = *u++;
978 }
979#endif
980
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000981 if (size > PyUnicode_GET_SIZE(unicode))
982 return PyUnicode_GET_SIZE(unicode);
983 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000984 return size;
985}
986
987#endif
988
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000989PyObject *PyUnicode_FromOrdinal(int ordinal)
990{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000991 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000992
993#ifdef Py_UNICODE_WIDE
994 if (ordinal < 0 || ordinal > 0x10ffff) {
995 PyErr_SetString(PyExc_ValueError,
996 "unichr() arg not in range(0x110000) "
997 "(wide Python build)");
998 return NULL;
999 }
1000#else
1001 if (ordinal < 0 || ordinal > 0xffff) {
1002 PyErr_SetString(PyExc_ValueError,
1003 "unichr() arg not in range(0x10000) "
1004 "(narrow Python build)");
1005 return NULL;
1006 }
1007#endif
1008
Hye-Shik Chang40574832004-04-06 07:24:51 +00001009 s[0] = (Py_UNICODE)ordinal;
1010 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001011}
1012
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013PyObject *PyUnicode_FromObject(register PyObject *obj)
1014{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001015 /* XXX Perhaps we should make this API an alias of
1016 PyObject_Unicode() instead ?! */
1017 if (PyUnicode_CheckExact(obj)) {
1018 Py_INCREF(obj);
1019 return obj;
1020 }
1021 if (PyUnicode_Check(obj)) {
1022 /* For a Unicode subtype that's not a Unicode object,
1023 return a true Unicode object with the same data. */
1024 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1025 PyUnicode_GET_SIZE(obj));
1026 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001027 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1028}
1029
1030PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1031 const char *encoding,
1032 const char *errors)
1033{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001034 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001035 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001036 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001037
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 if (obj == NULL) {
1039 PyErr_BadInternalCall();
1040 return NULL;
1041 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001042
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001043#if 0
1044 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001045 that no encodings is given and then redirect to
1046 PyObject_Unicode() which then applies the additional logic for
1047 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001048
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001049 NOTE: This API should really only be used for object which
1050 represent *encoded* Unicode !
1051
1052 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001053 if (PyUnicode_Check(obj)) {
1054 if (encoding) {
1055 PyErr_SetString(PyExc_TypeError,
1056 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001057 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001058 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001059 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001060 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001061#else
1062 if (PyUnicode_Check(obj)) {
1063 PyErr_SetString(PyExc_TypeError,
1064 "decoding Unicode is not supported");
1065 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001066 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001067#endif
1068
1069 /* Coerce object */
1070 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001071 s = PyString_AS_STRING(obj);
1072 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001073 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001074 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1075 /* Overwrite the error message with something more useful in
1076 case of a TypeError. */
1077 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001078 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 "coercing to Unicode: need string or buffer, "
1080 "%.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00001081 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001082 goto onError;
1083 }
Tim Petersced69f82003-09-16 20:30:58 +00001084
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001085 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 if (len == 0) {
1087 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001088 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089 }
Tim Petersced69f82003-09-16 20:30:58 +00001090 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001091 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001092
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001093 return v;
1094
1095 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097}
1098
1099PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001100 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 const char *encoding,
1102 const char *errors)
1103{
1104 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001105
1106 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001107 encoding = PyUnicode_GetDefaultEncoding();
1108
1109 /* Shortcuts for common default encodings */
1110 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001112 else if (strcmp(encoding, "latin-1") == 0)
1113 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001114#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1115 else if (strcmp(encoding, "mbcs") == 0)
1116 return PyUnicode_DecodeMBCS(s, size, errors);
1117#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001118 else if (strcmp(encoding, "ascii") == 0)
1119 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001120
1121 /* Decode via the codec registry */
1122 buffer = PyBuffer_FromMemory((void *)s, size);
1123 if (buffer == NULL)
1124 goto onError;
1125 unicode = PyCodec_Decode(buffer, encoding, errors);
1126 if (unicode == NULL)
1127 goto onError;
1128 if (!PyUnicode_Check(unicode)) {
1129 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001130 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001131 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 Py_DECREF(unicode);
1133 goto onError;
1134 }
1135 Py_DECREF(buffer);
1136 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001137
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 onError:
1139 Py_XDECREF(buffer);
1140 return NULL;
1141}
1142
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001143PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1144 const char *encoding,
1145 const char *errors)
1146{
1147 PyObject *v;
1148
1149 if (!PyUnicode_Check(unicode)) {
1150 PyErr_BadArgument();
1151 goto onError;
1152 }
1153
1154 if (encoding == NULL)
1155 encoding = PyUnicode_GetDefaultEncoding();
1156
1157 /* Decode via the codec registry */
1158 v = PyCodec_Decode(unicode, encoding, errors);
1159 if (v == NULL)
1160 goto onError;
1161 return v;
1162
1163 onError:
1164 return NULL;
1165}
1166
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001168 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 const char *encoding,
1170 const char *errors)
1171{
1172 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001173
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 unicode = PyUnicode_FromUnicode(s, size);
1175 if (unicode == NULL)
1176 return NULL;
1177 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1178 Py_DECREF(unicode);
1179 return v;
1180}
1181
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001182PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1183 const char *encoding,
1184 const char *errors)
1185{
1186 PyObject *v;
1187
1188 if (!PyUnicode_Check(unicode)) {
1189 PyErr_BadArgument();
1190 goto onError;
1191 }
1192
1193 if (encoding == NULL)
1194 encoding = PyUnicode_GetDefaultEncoding();
1195
1196 /* Encode via the codec registry */
1197 v = PyCodec_Encode(unicode, encoding, errors);
1198 if (v == NULL)
1199 goto onError;
1200 return v;
1201
1202 onError:
1203 return NULL;
1204}
1205
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1207 const char *encoding,
1208 const char *errors)
1209{
1210 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001211
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212 if (!PyUnicode_Check(unicode)) {
1213 PyErr_BadArgument();
1214 goto onError;
1215 }
Fred Drakee4315f52000-05-09 19:53:39 +00001216
Tim Petersced69f82003-09-16 20:30:58 +00001217 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001218 encoding = PyUnicode_GetDefaultEncoding();
1219
1220 /* Shortcuts for common default encodings */
1221 if (errors == NULL) {
1222 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001223 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001224 else if (strcmp(encoding, "latin-1") == 0)
1225 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001226#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1227 else if (strcmp(encoding, "mbcs") == 0)
1228 return PyUnicode_AsMBCSString(unicode);
1229#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001230 else if (strcmp(encoding, "ascii") == 0)
1231 return PyUnicode_AsASCIIString(unicode);
1232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233
1234 /* Encode via the codec registry */
1235 v = PyCodec_Encode(unicode, encoding, errors);
1236 if (v == NULL)
1237 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 if (!PyString_Check(v)) {
1239 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001240 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001241 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 Py_DECREF(v);
1243 goto onError;
1244 }
1245 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001246
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 onError:
1248 return NULL;
1249}
1250
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001251PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1252 const char *errors)
1253{
1254 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1255
1256 if (v)
1257 return v;
1258 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1259 if (v && errors == NULL)
1260 ((PyUnicodeObject *)unicode)->defenc = v;
1261 return v;
1262}
1263
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1265{
1266 if (!PyUnicode_Check(unicode)) {
1267 PyErr_BadArgument();
1268 goto onError;
1269 }
1270 return PyUnicode_AS_UNICODE(unicode);
1271
1272 onError:
1273 return NULL;
1274}
1275
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277{
1278 if (!PyUnicode_Check(unicode)) {
1279 PyErr_BadArgument();
1280 goto onError;
1281 }
1282 return PyUnicode_GET_SIZE(unicode);
1283
1284 onError:
1285 return -1;
1286}
1287
Thomas Wouters78890102000-07-22 19:25:51 +00001288const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001289{
1290 return unicode_default_encoding;
1291}
1292
1293int PyUnicode_SetDefaultEncoding(const char *encoding)
1294{
1295 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001296
Fred Drakee4315f52000-05-09 19:53:39 +00001297 /* Make sure the encoding is valid. As side effect, this also
1298 loads the encoding into the codec registry cache. */
1299 v = _PyCodec_Lookup(encoding);
1300 if (v == NULL)
1301 goto onError;
1302 Py_DECREF(v);
1303 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +00001304 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +00001305 sizeof(unicode_default_encoding));
1306 return 0;
1307
1308 onError:
1309 return -1;
1310}
1311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001312/* error handling callback helper:
1313 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001314 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001315 and adjust various state variables.
1316 return 0 on success, -1 on error
1317*/
1318
1319static
1320int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1321 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +00001322 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1323 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001324 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001325{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001326 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327
1328 PyObject *restuple = NULL;
1329 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001330 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1331 Py_ssize_t requiredsize;
1332 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001334 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335 int res = -1;
1336
1337 if (*errorHandler == NULL) {
1338 *errorHandler = PyCodec_LookupError(errors);
1339 if (*errorHandler == NULL)
1340 goto onError;
1341 }
1342
1343 if (*exceptionObject == NULL) {
1344 *exceptionObject = PyUnicodeDecodeError_Create(
1345 encoding, input, insize, *startinpos, *endinpos, reason);
1346 if (*exceptionObject == NULL)
1347 goto onError;
1348 }
1349 else {
1350 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1351 goto onError;
1352 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1353 goto onError;
1354 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1355 goto onError;
1356 }
1357
1358 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1359 if (restuple == NULL)
1360 goto onError;
1361 if (!PyTuple_Check(restuple)) {
1362 PyErr_Format(PyExc_TypeError, &argparse[4]);
1363 goto onError;
1364 }
1365 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1366 goto onError;
1367 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001368 newpos = insize+newpos;
1369 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001370 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001371 goto onError;
1372 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001373
1374 /* need more space? (at least enough for what we
1375 have+the replacement+the rest of the string (starting
1376 at the new input position), so we won't have to check space
1377 when there are no errors in the rest of the string) */
1378 repptr = PyUnicode_AS_UNICODE(repunicode);
1379 repsize = PyUnicode_GET_SIZE(repunicode);
1380 requiredsize = *outpos + repsize + insize-newpos;
1381 if (requiredsize > outsize) {
1382 if (requiredsize<2*outsize)
1383 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001384 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 goto onError;
1386 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1387 }
1388 *endinpos = newpos;
1389 *inptr = input + newpos;
1390 Py_UNICODE_COPY(*outptr, repptr, repsize);
1391 *outptr += repsize;
1392 *outpos += repsize;
1393 /* we made it! */
1394 res = 0;
1395
1396 onError:
1397 Py_XDECREF(restuple);
1398 return res;
1399}
1400
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001401/* --- UTF-7 Codec -------------------------------------------------------- */
1402
1403/* see RFC2152 for details */
1404
/* Classification table for the 128 ASCII code points, indexed by
   character value (16 entries per row).  Consulted through the SPECIAL()
   macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1423
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* SPECIAL(c, encodeO, encodeWS): true when c cannot be emitted directly,
   i.e. c is outside 1..127, or its utf7_special class is 1, or it is
   class 2 (whitespace) with encodeWS set, or class 3 (Set O) with encodeO
   set.  Evaluates c more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is alphanumeric (per isalnum()), '+' or '/'. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64() using ASCII arithmetic: '+' -> 62, '/' -> 63,
   c >= 'a' -> c - 71, c >= 'A' -> c - 65, otherwise c + 4 (digits).
   The argument is not validated; callers test B64CHAR() first. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write
   the top pending 6 bits as a base64 character to *out++ and decrease
   bits by 6.  Expands to a bare while loop (not wrapped in
   do { } while (0)) and modifies out and bits in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, take the top pending 16 bits as one character.  Values in
   0xDC00..0xDFFF set the surrogate flag, assign to a variable named
   `errmsg` and jump to a label named `utf7Error`, both of which must
   exist at the expansion site; the character following such an error is
   dropped and the flag cleared.  Anything else is stored to *out++.
   Expands to a bare while loop and modifies out, bits and surrogate in
   place. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001466
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001468 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001469 const char *errors)
1470{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001471 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1472}
1473
/* Decode `size` bytes of UTF-7 data starting at `s`.

   errors   - name of the error handler, passed to
              unicode_decode_call_errorhandler() for every decoding error.
   consumed - if NULL, a shift sequence still open at the end of the input
              is reported as an "unterminated shift sequence" error.  If
              non-NULL, no such error is raised and *consumed receives the
              number of input bytes processed: the position of the '+'
              that opened the pending shift sequence, or the end position
              otherwise.

   Returns a new Unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;          /* beginning of the input */
    Py_ssize_t startinpos;           /* input offset where the current
                                        shift sequence or error began */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                   /* one past the end of the input */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                   /* output write pointer */
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero inside a '+...' sequence */
    unsigned int bitsleft = 0;       /* number of pending bits in charsleft */
    unsigned long charsleft = 0;     /* bit accumulator for base64 data */
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Allocate `size` output elements up front. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Also entered via goto after the unterminated-shift error
           handler below moved s back inside the input. */
    restart:
        ch = *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* '-' or any non-base64 character ends the shift
                   sequence. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' itself is dropped; if another
                       '-' follows, a '-' is emitted and shift mode is
                       re-entered. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    /* The implicit terminator is ordinary output. */
                    *p++ = ch;
                }
            } else {
                /* Accumulate 6 more bits and flush any complete 16-bit
                   units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts for error reporting and
               for *consumed. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" encodes a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
        /* Reached only via goto (including from inside DECODE). The
           handler may resize `unicode` and updates s, p and outpos. */
    utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }

    /* In stateless mode an open shift sequence at end of input is an
       error. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                starts, size, &startinpos, &endinpos, &exc, &s,
                (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* The handler may have repositioned s before the end of input;
           resume decoding from there. */
        if (s < e)
            goto restart;
    }
    if (consumed) {
        /* A pending shift sequence is left unconsumed, starting at its
           '+'. */
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the result to the number of elements actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1619
1620
1621PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001622 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001623 int encodeSetO,
1624 int encodeWhiteSpace,
1625 const char *errors)
1626{
1627 PyObject *v;
1628 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001629 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001630 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001631 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001632 unsigned int bitsleft = 0;
1633 unsigned long charsleft = 0;
1634 char * out;
1635 char * start;
1636
1637 if (size == 0)
1638 return PyString_FromStringAndSize(NULL, 0);
1639
1640 v = PyString_FromStringAndSize(NULL, cbAllocated);
1641 if (v == NULL)
1642 return NULL;
1643
1644 start = out = PyString_AS_STRING(v);
1645 for (;i < size; ++i) {
1646 Py_UNICODE ch = s[i];
1647
1648 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001649 if (ch == '+') {
1650 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 *out++ = '-';
1652 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1653 charsleft = ch;
1654 bitsleft = 16;
1655 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001656 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001658 } else {
1659 *out++ = (char) ch;
1660 }
1661 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001662 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1663 *out++ = B64(charsleft << (6-bitsleft));
1664 charsleft = 0;
1665 bitsleft = 0;
1666 /* Characters not in the BASE64 set implicitly unshift the sequence
1667 so no '-' is required, except if the character is itself a '-' */
1668 if (B64CHAR(ch) || ch == '-') {
1669 *out++ = '-';
1670 }
1671 inShift = 0;
1672 *out++ = (char) ch;
1673 } else {
1674 bitsleft += 16;
1675 charsleft = (charsleft << 16) | ch;
1676 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1677
1678 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001679 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001680 or '-' then the shift sequence will be terminated implicitly and we
1681 don't have to insert a '-'. */
1682
1683 if (bitsleft == 0) {
1684 if (i + 1 < size) {
1685 Py_UNICODE ch2 = s[i+1];
1686
1687 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001688
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 } else if (B64CHAR(ch2) || ch2 == '-') {
1690 *out++ = '-';
1691 inShift = 0;
1692 } else {
1693 inShift = 0;
1694 }
1695
1696 }
1697 else {
1698 *out++ = '-';
1699 inShift = 0;
1700 }
1701 }
Tim Petersced69f82003-09-16 20:30:58 +00001702 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001703 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001704 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001705 if (bitsleft) {
1706 *out++= B64(charsleft << (6-bitsleft) );
1707 *out++ = '-';
1708 }
1709
Tim Peters5de98422002-04-27 18:44:32 +00001710 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001711 return v;
1712}
1713
1714#undef SPECIAL
1715#undef B64
1716#undef B64CHAR
1717#undef UB64
1718#undef ENCODE
1719#undef DECODE
1720
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721/* --- UTF-8 Codec -------------------------------------------------------- */
1722
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length of the sequence in bytes.  An entry of zero marks a
   byte that is not a legal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1744
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001746 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 const char *errors)
1748{
Walter Dörwald69652032004-09-07 20:24:22 +00001749 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1750}
1751
1752PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001753 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001754 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001755 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001757 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001759 Py_ssize_t startinpos;
1760 Py_ssize_t endinpos;
1761 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762 const char *e;
1763 PyUnicodeObject *unicode;
1764 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001766 PyObject *errorHandler = NULL;
1767 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768
1769 /* Note: size will always be longer than the resulting Unicode
1770 character count */
1771 unicode = _PyUnicode_New(size);
1772 if (!unicode)
1773 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001774 if (size == 0) {
1775 if (consumed)
1776 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779
1780 /* Unpack UTF-8 encoded data */
1781 p = unicode->str;
1782 e = s + size;
1783
1784 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001785 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786
1787 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001788 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 s++;
1790 continue;
1791 }
1792
1793 n = utf8_code_length[ch];
1794
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001795 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001796 if (consumed)
1797 break;
1798 else {
1799 errmsg = "unexpected end of data";
1800 startinpos = s-starts;
1801 endinpos = size;
1802 goto utf8Error;
1803 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805
1806 switch (n) {
1807
1808 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001809 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 startinpos = s-starts;
1811 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001812 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813
1814 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001815 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 startinpos = s-starts;
1817 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001818 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819
1820 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001821 if ((s[1] & 0xc0) != 0x80) {
1822 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 startinpos = s-starts;
1824 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001825 goto utf8Error;
1826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001828 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829 startinpos = s-starts;
1830 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001831 errmsg = "illegal encoding";
1832 goto utf8Error;
1833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001835 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 break;
1837
1838 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001839 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001840 (s[2] & 0xc0) != 0x80) {
1841 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 startinpos = s-starts;
1843 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 goto utf8Error;
1845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001847 if (ch < 0x0800) {
1848 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001849 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001850
1851 XXX For wide builds (UCS-4) we should probably try
1852 to recombine the surrogates into a single code
1853 unit.
1854 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001855 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001856 startinpos = s-starts;
1857 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001858 goto utf8Error;
1859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001861 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001862 break;
1863
1864 case 4:
1865 if ((s[1] & 0xc0) != 0x80 ||
1866 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001867 (s[3] & 0xc0) != 0x80) {
1868 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 startinpos = s-starts;
1870 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001871 goto utf8Error;
1872 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001873 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1874 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1875 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001876 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001877 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001878 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001879 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001880 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001881 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001882 startinpos = s-starts;
1883 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001884 goto utf8Error;
1885 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001886#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001887 *p++ = (Py_UNICODE)ch;
1888#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001889 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001890
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001891 /* translate from 10000..10FFFF to 0..FFFF */
1892 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001893
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001894 /* high surrogate = top 10 bits added to D800 */
1895 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001896
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001897 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001898 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001899#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 break;
1901
1902 default:
1903 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001904 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001905 startinpos = s-starts;
1906 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001907 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908 }
1909 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001910 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001911
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001912 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001913 outpos = p-PyUnicode_AS_UNICODE(unicode);
1914 if (unicode_decode_call_errorhandler(
1915 errors, &errorHandler,
1916 "utf8", errmsg,
1917 starts, size, &startinpos, &endinpos, &exc, &s,
1918 (PyObject **)&unicode, &outpos, &p))
1919 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 }
Walter Dörwald69652032004-09-07 20:24:22 +00001921 if (consumed)
1922 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923
1924 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001925 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 goto onError;
1927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001928 Py_XDECREF(errorHandler);
1929 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 return (PyObject *)unicode;
1931
1932onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001933 Py_XDECREF(errorHandler);
1934 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 Py_DECREF(unicode);
1936 return NULL;
1937}
1938
Tim Peters602f7402002-04-27 18:03:26 +00001939/* Allocation strategy: if the string is short, convert into a stack buffer
1940 and allocate exactly as much space needed at the end. Else allocate the
1941 maximum possible needed (4 result bytes per Unicode character), and return
1942 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001943*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001944PyObject *
1945PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001946 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001947 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948{
Tim Peters602f7402002-04-27 18:03:26 +00001949#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001950
Martin v. Löwis18e16552006-02-15 17:27:45 +00001951 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001952 PyObject *v; /* result string object */
1953 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001954 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001955 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001956 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001957
Tim Peters602f7402002-04-27 18:03:26 +00001958 assert(s != NULL);
1959 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960
Tim Peters602f7402002-04-27 18:03:26 +00001961 if (size <= MAX_SHORT_UNICHARS) {
1962 /* Write into the stack buffer; nallocated can't overflow.
1963 * At the end, we'll allocate exactly as much heap space as it
1964 * turns out we need.
1965 */
1966 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1967 v = NULL; /* will allocate after we're done */
1968 p = stackbuf;
1969 }
1970 else {
1971 /* Overallocate on the heap, and give the excess back at the end. */
1972 nallocated = size * 4;
1973 if (nallocated / 4 != size) /* overflow! */
1974 return PyErr_NoMemory();
1975 v = PyString_FromStringAndSize(NULL, nallocated);
1976 if (v == NULL)
1977 return NULL;
1978 p = PyString_AS_STRING(v);
1979 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001980
Tim Peters602f7402002-04-27 18:03:26 +00001981 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001982 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001983
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001984 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001985 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001987
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001989 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001990 *p++ = (char)(0xc0 | (ch >> 6));
1991 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001992 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001993 else {
Tim Peters602f7402002-04-27 18:03:26 +00001994 /* Encode UCS2 Unicode ordinals */
1995 if (ch < 0x10000) {
1996 /* Special case: check for high surrogate */
1997 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1998 Py_UCS4 ch2 = s[i];
1999 /* Check for low surrogate and combine the two to
2000 form a UCS4 value */
2001 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002002 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002003 i++;
2004 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002005 }
Tim Peters602f7402002-04-27 18:03:26 +00002006 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002007 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002008 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002009 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2010 *p++ = (char)(0x80 | (ch & 0x3f));
2011 continue;
2012 }
2013encodeUCS4:
2014 /* Encode UCS4 Unicode ordinals */
2015 *p++ = (char)(0xf0 | (ch >> 18));
2016 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2017 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2018 *p++ = (char)(0x80 | (ch & 0x3f));
2019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002021
Tim Peters602f7402002-04-27 18:03:26 +00002022 if (v == NULL) {
2023 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002024 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002025 assert(nneeded <= nallocated);
2026 v = PyString_FromStringAndSize(stackbuf, nneeded);
2027 }
2028 else {
2029 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002030 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002031 assert(nneeded <= nallocated);
2032 _PyString_Resize(&v, nneeded);
2033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002035
Tim Peters602f7402002-04-27 18:03:26 +00002036#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037}
2038
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2040{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 if (!PyUnicode_Check(unicode)) {
2042 PyErr_BadArgument();
2043 return NULL;
2044 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002045 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2046 PyUnicode_GET_SIZE(unicode),
2047 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048}
2049
Walter Dörwald6e390802007-08-17 16:41:28 +00002050/* --- UTF-32 Codec ------------------------------------------------------- */
2051
2052PyObject *
2053PyUnicode_DecodeUTF32(const char *s,
2054 Py_ssize_t size,
2055 const char *errors,
2056 int *byteorder)
2057{
2058 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2059}
2060
2061PyObject *
2062PyUnicode_DecodeUTF32Stateful(const char *s,
2063 Py_ssize_t size,
2064 const char *errors,
2065 int *byteorder,
2066 Py_ssize_t *consumed)
2067{
2068 const char *starts = s;
2069 Py_ssize_t startinpos;
2070 Py_ssize_t endinpos;
2071 Py_ssize_t outpos;
2072 PyUnicodeObject *unicode;
2073 Py_UNICODE *p;
2074#ifndef Py_UNICODE_WIDE
2075 int i, pairs;
2076#else
2077 const int pairs = 0;
2078#endif
2079 const unsigned char *q, *e;
2080 int bo = 0; /* assume native ordering by default */
2081 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002082 /* Offsets from q for retrieving bytes in the right order. */
2083#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2084 int iorder[] = {0, 1, 2, 3};
2085#else
2086 int iorder[] = {3, 2, 1, 0};
2087#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002088 PyObject *errorHandler = NULL;
2089 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002090 /* On narrow builds we split characters outside the BMP into two
2091 codepoints => count how much extra space we need. */
2092#ifndef Py_UNICODE_WIDE
2093 for (i = pairs = 0; i < size/4; i++)
2094 if (((Py_UCS4 *)s)[i] >= 0x10000)
2095 pairs++;
2096#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002097
2098 /* This might be one to much, because of a BOM */
2099 unicode = _PyUnicode_New((size+3)/4+pairs);
2100 if (!unicode)
2101 return NULL;
2102 if (size == 0)
2103 return (PyObject *)unicode;
2104
2105 /* Unpack UTF-32 encoded data */
2106 p = unicode->str;
2107 q = (unsigned char *)s;
2108 e = q + size;
2109
2110 if (byteorder)
2111 bo = *byteorder;
2112
2113 /* Check for BOM marks (U+FEFF) in the input and adjust current
2114 byte order setting accordingly. In native mode, the leading BOM
2115 mark is skipped, in all other modes, it is copied to the output
2116 stream as-is (giving a ZWNBSP character). */
2117 if (bo == 0) {
2118 if (size >= 4) {
2119 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2120 (q[iorder[1]] << 8) | q[iorder[0]];
2121#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2122 if (bom == 0x0000FEFF) {
2123 q += 4;
2124 bo = -1;
2125 }
2126 else if (bom == 0xFFFE0000) {
2127 q += 4;
2128 bo = 1;
2129 }
2130#else
2131 if (bom == 0x0000FEFF) {
2132 q += 4;
2133 bo = 1;
2134 }
2135 else if (bom == 0xFFFE0000) {
2136 q += 4;
2137 bo = -1;
2138 }
2139#endif
2140 }
2141 }
2142
2143 if (bo == -1) {
2144 /* force LE */
2145 iorder[0] = 0;
2146 iorder[1] = 1;
2147 iorder[2] = 2;
2148 iorder[3] = 3;
2149 }
2150 else if (bo == 1) {
2151 /* force BE */
2152 iorder[0] = 3;
2153 iorder[1] = 2;
2154 iorder[2] = 1;
2155 iorder[3] = 0;
2156 }
2157
2158 while (q < e) {
2159 Py_UCS4 ch;
2160 /* remaining bytes at the end? (size should be divisible by 4) */
2161 if (e-q<4) {
2162 if (consumed)
2163 break;
2164 errmsg = "truncated data";
2165 startinpos = ((const char *)q)-starts;
2166 endinpos = ((const char *)e)-starts;
2167 goto utf32Error;
2168 /* The remaining input chars are ignored if the callback
2169 chooses to skip the input */
2170 }
2171 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2172 (q[iorder[1]] << 8) | q[iorder[0]];
2173
2174 if (ch >= 0x110000)
2175 {
2176 errmsg = "codepoint not in range(0x110000)";
2177 startinpos = ((const char *)q)-starts;
2178 endinpos = startinpos+4;
2179 goto utf32Error;
2180 }
2181#ifndef Py_UNICODE_WIDE
2182 if (ch >= 0x10000)
2183 {
2184 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2185 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2186 }
2187 else
2188#endif
2189 *p++ = ch;
2190 q += 4;
2191 continue;
2192 utf32Error:
2193 outpos = p-PyUnicode_AS_UNICODE(unicode);
2194 if (unicode_decode_call_errorhandler(
2195 errors, &errorHandler,
2196 "utf32", errmsg,
2197 starts, size, &startinpos, &endinpos, &exc, &s,
2198 (PyObject **)&unicode, &outpos, &p))
2199 goto onError;
2200 }
2201
2202 if (byteorder)
2203 *byteorder = bo;
2204
2205 if (consumed)
2206 *consumed = (const char *)q-starts;
2207
2208 /* Adjust length */
2209 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2210 goto onError;
2211
2212 Py_XDECREF(errorHandler);
2213 Py_XDECREF(exc);
2214 return (PyObject *)unicode;
2215
2216onError:
2217 Py_DECREF(unicode);
2218 Py_XDECREF(errorHandler);
2219 Py_XDECREF(exc);
2220 return NULL;
2221}
2222
2223PyObject *
2224PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2225 Py_ssize_t size,
2226 const char *errors,
2227 int byteorder)
2228{
2229 PyObject *v;
2230 unsigned char *p;
2231#ifndef Py_UNICODE_WIDE
2232 int i, pairs;
2233#else
2234 const int pairs = 0;
2235#endif
2236 /* Offsets from p for storing byte pairs in the right order. */
2237#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2238 int iorder[] = {0, 1, 2, 3};
2239#else
2240 int iorder[] = {3, 2, 1, 0};
2241#endif
2242
2243#define STORECHAR(CH) \
2244 do { \
2245 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2246 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2247 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2248 p[iorder[0]] = (CH) & 0xff; \
2249 p += 4; \
2250 } while(0)
2251
2252 /* In narrow builds we can output surrogate pairs as one codepoint,
2253 so we need less space. */
2254#ifndef Py_UNICODE_WIDE
2255 for (i = pairs = 0; i < size-1; i++)
2256 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2257 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2258 pairs++;
2259#endif
2260 v = PyString_FromStringAndSize(NULL,
2261 4 * (size - pairs + (byteorder == 0)));
2262 if (v == NULL)
2263 return NULL;
2264
2265 p = (unsigned char *)PyString_AS_STRING(v);
2266 if (byteorder == 0)
2267 STORECHAR(0xFEFF);
2268 if (size == 0)
2269 return v;
2270
2271 if (byteorder == -1) {
2272 /* force LE */
2273 iorder[0] = 0;
2274 iorder[1] = 1;
2275 iorder[2] = 2;
2276 iorder[3] = 3;
2277 }
2278 else if (byteorder == 1) {
2279 /* force BE */
2280 iorder[0] = 3;
2281 iorder[1] = 2;
2282 iorder[2] = 1;
2283 iorder[3] = 0;
2284 }
2285
2286 while (size-- > 0) {
2287 Py_UCS4 ch = *s++;
2288#ifndef Py_UNICODE_WIDE
2289 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2290 Py_UCS4 ch2 = *s;
2291 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2292 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2293 s++;
2294 size--;
2295 }
2296 }
2297#endif
2298 STORECHAR(ch);
2299 }
2300 return v;
2301#undef STORECHAR
2302}
2303
2304PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2305{
2306 if (!PyUnicode_Check(unicode)) {
2307 PyErr_BadArgument();
2308 return NULL;
2309 }
2310 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2311 PyUnicode_GET_SIZE(unicode),
2312 NULL,
2313 0);
2314}
2315
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316/* --- UTF-16 Codec ------------------------------------------------------- */
2317
Tim Peters772747b2001-08-09 22:21:55 +00002318PyObject *
2319PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002320 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002321 const char *errors,
2322 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323{
Walter Dörwald69652032004-09-07 20:24:22 +00002324 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2325}
2326
2327PyObject *
2328PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002329 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002330 const char *errors,
2331 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002332 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002333{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002334 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002335 Py_ssize_t startinpos;
2336 Py_ssize_t endinpos;
2337 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338 PyUnicodeObject *unicode;
2339 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002340 const unsigned char *q, *e;
2341 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002342 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002343 /* Offsets from q for retrieving byte pairs in the right order. */
2344#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2345 int ihi = 1, ilo = 0;
2346#else
2347 int ihi = 0, ilo = 1;
2348#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 PyObject *errorHandler = NULL;
2350 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351
2352 /* Note: size will always be longer than the resulting Unicode
2353 character count */
2354 unicode = _PyUnicode_New(size);
2355 if (!unicode)
2356 return NULL;
2357 if (size == 0)
2358 return (PyObject *)unicode;
2359
2360 /* Unpack UTF-16 encoded data */
2361 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002362 q = (unsigned char *)s;
2363 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364
2365 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002366 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002368 /* Check for BOM marks (U+FEFF) in the input and adjust current
2369 byte order setting accordingly. In native mode, the leading BOM
2370 mark is skipped, in all other modes, it is copied to the output
2371 stream as-is (giving a ZWNBSP character). */
2372 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002373 if (size >= 2) {
2374 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002375#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002376 if (bom == 0xFEFF) {
2377 q += 2;
2378 bo = -1;
2379 }
2380 else if (bom == 0xFFFE) {
2381 q += 2;
2382 bo = 1;
2383 }
Tim Petersced69f82003-09-16 20:30:58 +00002384#else
Walter Dörwald69652032004-09-07 20:24:22 +00002385 if (bom == 0xFEFF) {
2386 q += 2;
2387 bo = 1;
2388 }
2389 else if (bom == 0xFFFE) {
2390 q += 2;
2391 bo = -1;
2392 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002393#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002394 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002395 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396
Tim Peters772747b2001-08-09 22:21:55 +00002397 if (bo == -1) {
2398 /* force LE */
2399 ihi = 1;
2400 ilo = 0;
2401 }
2402 else if (bo == 1) {
2403 /* force BE */
2404 ihi = 0;
2405 ilo = 1;
2406 }
2407
2408 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002409 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002410 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002411 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002412 if (consumed)
2413 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002414 errmsg = "truncated data";
2415 startinpos = ((const char *)q)-starts;
2416 endinpos = ((const char *)e)-starts;
2417 goto utf16Error;
2418 /* The remaining input chars are ignored if the callback
2419 chooses to skip the input */
2420 }
2421 ch = (q[ihi] << 8) | q[ilo];
2422
Tim Peters772747b2001-08-09 22:21:55 +00002423 q += 2;
2424
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 if (ch < 0xD800 || ch > 0xDFFF) {
2426 *p++ = ch;
2427 continue;
2428 }
2429
2430 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002431 if (q >= e) {
2432 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002433 startinpos = (((const char *)q)-2)-starts;
2434 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002435 goto utf16Error;
2436 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002437 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002438 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2439 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002440 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002441#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002442 *p++ = ch;
2443 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002444#else
2445 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002446#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002447 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002448 }
2449 else {
2450 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002451 startinpos = (((const char *)q)-4)-starts;
2452 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002453 goto utf16Error;
2454 }
2455
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002458 startinpos = (((const char *)q)-2)-starts;
2459 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002460 /* Fall through to report the error */
2461
2462 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 outpos = p-PyUnicode_AS_UNICODE(unicode);
2464 if (unicode_decode_call_errorhandler(
2465 errors, &errorHandler,
2466 "utf16", errmsg,
2467 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2468 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 }
2471
2472 if (byteorder)
2473 *byteorder = bo;
2474
Walter Dörwald69652032004-09-07 20:24:22 +00002475 if (consumed)
2476 *consumed = (const char *)q-starts;
2477
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002479 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 goto onError;
2481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 return (PyObject *)unicode;
2485
2486onError:
2487 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002488 Py_XDECREF(errorHandler);
2489 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 return NULL;
2491}
2492
Tim Peters772747b2001-08-09 22:21:55 +00002493PyObject *
2494PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002495 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002496 const char *errors,
2497 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498{
2499 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002500 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002501#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002502 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002503#else
2504 const int pairs = 0;
2505#endif
Tim Peters772747b2001-08-09 22:21:55 +00002506 /* Offsets from p for storing byte pairs in the right order. */
2507#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2508 int ihi = 1, ilo = 0;
2509#else
2510 int ihi = 0, ilo = 1;
2511#endif
2512
2513#define STORECHAR(CH) \
2514 do { \
2515 p[ihi] = ((CH) >> 8) & 0xff; \
2516 p[ilo] = (CH) & 0xff; \
2517 p += 2; \
2518 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002520#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002521 for (i = pairs = 0; i < size; i++)
2522 if (s[i] >= 0x10000)
2523 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002524#endif
Tim Petersced69f82003-09-16 20:30:58 +00002525 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002526 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 if (v == NULL)
2528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529
Tim Peters772747b2001-08-09 22:21:55 +00002530 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002532 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002533 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002534 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002535
2536 if (byteorder == -1) {
2537 /* force LE */
2538 ihi = 1;
2539 ilo = 0;
2540 }
2541 else if (byteorder == 1) {
2542 /* force BE */
2543 ihi = 0;
2544 ilo = 1;
2545 }
2546
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002547 while (size-- > 0) {
2548 Py_UNICODE ch = *s++;
2549 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002550#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002551 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002552 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2553 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002555#endif
Tim Peters772747b2001-08-09 22:21:55 +00002556 STORECHAR(ch);
2557 if (ch2)
2558 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002561#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562}
2563
2564PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2565{
2566 if (!PyUnicode_Check(unicode)) {
2567 PyErr_BadArgument();
2568 return NULL;
2569 }
2570 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2571 PyUnicode_GET_SIZE(unicode),
2572 NULL,
2573 0);
2574}
2575
2576/* --- Unicode Escape Codec ----------------------------------------------- */
2577
Fredrik Lundh06d12682001-01-24 07:59:11 +00002578static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002579
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002581 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 const char *errors)
2583{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002585 Py_ssize_t startinpos;
2586 Py_ssize_t endinpos;
2587 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002592 char* message;
2593 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 PyObject *errorHandler = NULL;
2595 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002596
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 /* Escaped strings will always be longer than the resulting
2598 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002599 length after conversion to the true value.
2600 (but if the error callback returns a long replacement string
2601 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 v = _PyUnicode_New(size);
2603 if (v == NULL)
2604 goto onError;
2605 if (size == 0)
2606 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002610
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 while (s < end) {
2612 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002613 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615
2616 /* Non-escape characters are interpreted as Unicode ordinals */
2617 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002618 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 continue;
2620 }
2621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002622 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 /* \ - Escapes */
2624 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002625 c = *s++;
2626 if (s > end)
2627 c = '\0'; /* Invalid after \ */
2628 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629
2630 /* \x escapes */
2631 case '\n': break;
2632 case '\\': *p++ = '\\'; break;
2633 case '\'': *p++ = '\''; break;
2634 case '\"': *p++ = '\"'; break;
2635 case 'b': *p++ = '\b'; break;
2636 case 'f': *p++ = '\014'; break; /* FF */
2637 case 't': *p++ = '\t'; break;
2638 case 'n': *p++ = '\n'; break;
2639 case 'r': *p++ = '\r'; break;
2640 case 'v': *p++ = '\013'; break; /* VT */
2641 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2642
2643 /* \OOO (octal) escapes */
2644 case '0': case '1': case '2': case '3':
2645 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002646 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002647 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002648 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002649 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002650 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002652 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 break;
2654
Fredrik Lundhccc74732001-02-18 22:13:49 +00002655 /* hex escapes */
2656 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002658 digits = 2;
2659 message = "truncated \\xXX escape";
2660 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661
Fredrik Lundhccc74732001-02-18 22:13:49 +00002662 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002664 digits = 4;
2665 message = "truncated \\uXXXX escape";
2666 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667
Fredrik Lundhccc74732001-02-18 22:13:49 +00002668 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002669 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002670 digits = 8;
2671 message = "truncated \\UXXXXXXXX escape";
2672 hexescape:
2673 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 outpos = p-PyUnicode_AS_UNICODE(v);
2675 if (s+digits>end) {
2676 endinpos = size;
2677 if (unicode_decode_call_errorhandler(
2678 errors, &errorHandler,
2679 "unicodeescape", "end of string in escape sequence",
2680 starts, size, &startinpos, &endinpos, &exc, &s,
2681 (PyObject **)&v, &outpos, &p))
2682 goto onError;
2683 goto nextByte;
2684 }
2685 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002686 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002687 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002688 endinpos = (s+i+1)-starts;
2689 if (unicode_decode_call_errorhandler(
2690 errors, &errorHandler,
2691 "unicodeescape", message,
2692 starts, size, &startinpos, &endinpos, &exc, &s,
2693 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002694 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002695 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002696 }
2697 chr = (chr<<4) & ~0xF;
2698 if (c >= '0' && c <= '9')
2699 chr += c - '0';
2700 else if (c >= 'a' && c <= 'f')
2701 chr += 10 + c - 'a';
2702 else
2703 chr += 10 + c - 'A';
2704 }
2705 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002706 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 /* _decoding_error will have already written into the
2708 target buffer. */
2709 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002710 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002711 /* when we get here, chr is a 32-bit unicode character */
2712 if (chr <= 0xffff)
2713 /* UCS-2 character */
2714 *p++ = (Py_UNICODE) chr;
2715 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002716 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002717 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002718#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002719 *p++ = chr;
2720#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002721 chr -= 0x10000L;
2722 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002723 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002724#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002725 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 endinpos = s-starts;
2727 outpos = p-PyUnicode_AS_UNICODE(v);
2728 if (unicode_decode_call_errorhandler(
2729 errors, &errorHandler,
2730 "unicodeescape", "illegal Unicode character",
2731 starts, size, &startinpos, &endinpos, &exc, &s,
2732 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002733 goto onError;
2734 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002735 break;
2736
2737 /* \N{name} */
2738 case 'N':
2739 message = "malformed \\N character escape";
2740 if (ucnhash_CAPI == NULL) {
2741 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002742 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002743 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002744 if (m == NULL)
2745 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002746 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002747 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002748 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002749 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002750 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002751 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002752 if (ucnhash_CAPI == NULL)
2753 goto ucnhashError;
2754 }
2755 if (*s == '{') {
2756 const char *start = s+1;
2757 /* look for the closing brace */
2758 while (*s != '}' && s < end)
2759 s++;
2760 if (s > start && s < end && *s == '}') {
2761 /* found a name. look it up in the unicode database */
2762 message = "unknown Unicode character name";
2763 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002764 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002765 goto store;
2766 }
2767 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 endinpos = s-starts;
2769 outpos = p-PyUnicode_AS_UNICODE(v);
2770 if (unicode_decode_call_errorhandler(
2771 errors, &errorHandler,
2772 "unicodeescape", message,
2773 starts, size, &startinpos, &endinpos, &exc, &s,
2774 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002775 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002776 break;
2777
2778 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002779 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002780 message = "\\ at end of string";
2781 s--;
2782 endinpos = s-starts;
2783 outpos = p-PyUnicode_AS_UNICODE(v);
2784 if (unicode_decode_call_errorhandler(
2785 errors, &errorHandler,
2786 "unicodeescape", message,
2787 starts, size, &startinpos, &endinpos, &exc, &s,
2788 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002789 goto onError;
2790 }
2791 else {
2792 *p++ = '\\';
2793 *p++ = (unsigned char)s[-1];
2794 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002795 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002797 nextByte:
2798 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002800 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002801 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002802 Py_XDECREF(errorHandler);
2803 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002805
Fredrik Lundhccc74732001-02-18 22:13:49 +00002806ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002807 PyErr_SetString(
2808 PyExc_UnicodeError,
2809 "\\N escapes not supported (can't load unicodedata module)"
2810 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002811 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 Py_XDECREF(errorHandler);
2813 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002814 return NULL;
2815
Fredrik Lundhccc74732001-02-18 22:13:49 +00002816onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 Py_XDECREF(errorHandler);
2819 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 return NULL;
2821}
2822
2823/* Return a Unicode-Escape string version of the Unicode object.
2824
2825 If quotes is true, the string is enclosed in u"" or u'' quotes as
2826 appropriate.
2827
2828*/
2829
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002830Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002831 Py_ssize_t size,
2832 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002833{
2834 /* like wcschr, but doesn't stop at NULL characters */
2835
2836 while (size-- > 0) {
2837 if (*s == ch)
2838 return s;
2839 s++;
2840 }
2841
2842 return NULL;
2843}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845static
2846PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002847 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 int quotes)
2849{
2850 PyObject *repr;
2851 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002853 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854
Neal Norwitz17753ec2006-08-21 22:21:19 +00002855 /* XXX(nnorwitz): rather than over-allocating, it would be
2856 better to choose a different scheme. Perhaps scan the
2857 first N-chars of the string and allocate based on that size.
2858 */
2859 /* Initial allocation is based on the longest-possible unichr
2860 escape.
2861
2862 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2863 unichr, so in this case it's the longest unichr escape. In
2864 narrow (UTF-16) builds this is five chars per source unichr
2865 since there are two unichrs in the surrogate pair, so in narrow
2866 (UTF-16) builds it's not the longest unichr escape.
2867
2868 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2869 so in the narrow (UTF-16) build case it's the longest unichr
2870 escape.
2871 */
2872
2873 repr = PyString_FromStringAndSize(NULL,
2874 2
2875#ifdef Py_UNICODE_WIDE
2876 + 10*size
2877#else
2878 + 6*size
2879#endif
2880 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881 if (repr == NULL)
2882 return NULL;
2883
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002884 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885
2886 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002888 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 !findchar(s, size, '"')) ? '"' : '\'';
2890 }
2891 while (size-- > 0) {
2892 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002893
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002894 /* Escape quotes and backslashes */
2895 if ((quotes &&
2896 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897 *p++ = '\\';
2898 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002899 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002900 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002901
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002902#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002903 /* Map 21-bit characters to '\U00xxxxxx' */
2904 else if (ch >= 0x10000) {
2905 *p++ = '\\';
2906 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002907 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2908 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2909 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2910 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2911 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2912 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2913 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002914 *p++ = hexdigit[ch & 0x0000000F];
2915 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002916 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002917#else
2918 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002919 else if (ch >= 0xD800 && ch < 0xDC00) {
2920 Py_UNICODE ch2;
2921 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002922
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002923 ch2 = *s++;
2924 size--;
2925 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2926 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2927 *p++ = '\\';
2928 *p++ = 'U';
2929 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2930 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2931 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2932 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2933 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2934 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2935 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2936 *p++ = hexdigit[ucs & 0x0000000F];
2937 continue;
2938 }
2939 /* Fall through: isolated surrogates are copied as-is */
2940 s--;
2941 size++;
2942 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002943#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002944
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002946 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002947 *p++ = '\\';
2948 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002949 *p++ = hexdigit[(ch >> 12) & 0x000F];
2950 *p++ = hexdigit[(ch >> 8) & 0x000F];
2951 *p++ = hexdigit[(ch >> 4) & 0x000F];
2952 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002953 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002954
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002955 /* Map special whitespace to '\t', \n', '\r' */
2956 else if (ch == '\t') {
2957 *p++ = '\\';
2958 *p++ = 't';
2959 }
2960 else if (ch == '\n') {
2961 *p++ = '\\';
2962 *p++ = 'n';
2963 }
2964 else if (ch == '\r') {
2965 *p++ = '\\';
2966 *p++ = 'r';
2967 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002968
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002969 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002970 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002972 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002973 *p++ = hexdigit[(ch >> 4) & 0x000F];
2974 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002975 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002976
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 /* Copy everything else as-is */
2978 else
2979 *p++ = (char) ch;
2980 }
2981 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002982 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983
2984 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002985 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 return repr;
2987}
2988
2989PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002990 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991{
2992 return unicodeescape_string(s, size, 0);
2993}
2994
2995PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2996{
2997 if (!PyUnicode_Check(unicode)) {
2998 PyErr_BadArgument();
2999 return NULL;
3000 }
3001 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3002 PyUnicode_GET_SIZE(unicode));
3003}
3004
3005/* --- Raw Unicode Escape Codec ------------------------------------------- */
3006
3007PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003008 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 const char *errors)
3010{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003011 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003012 Py_ssize_t startinpos;
3013 Py_ssize_t endinpos;
3014 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003016 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 const char *end;
3018 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003019 PyObject *errorHandler = NULL;
3020 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 /* Escaped strings will always be longer than the resulting
3023 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024 length after conversion to the true value. (But decoding error
3025 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026 v = _PyUnicode_New(size);
3027 if (v == NULL)
3028 goto onError;
3029 if (size == 0)
3030 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 end = s + size;
3033 while (s < end) {
3034 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003035 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003037 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038
3039 /* Non-escape characters are interpreted as Unicode ordinals */
3040 if (*s != '\\') {
3041 *p++ = (unsigned char)*s++;
3042 continue;
3043 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045
3046 /* \u-escapes are only interpreted iff the number of leading
3047 backslashes if odd */
3048 bs = s;
3049 for (;s < end;) {
3050 if (*s != '\\')
3051 break;
3052 *p++ = (unsigned char)*s++;
3053 }
3054 if (((s - bs) & 1) == 0 ||
3055 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003056 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 continue;
3058 }
3059 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003060 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 s++;
3062
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003063 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003065 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 endinpos = s-starts;
3069 if (unicode_decode_call_errorhandler(
3070 errors, &errorHandler,
3071 "rawunicodeescape", "truncated \\uXXXX",
3072 starts, size, &startinpos, &endinpos, &exc, &s,
3073 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 }
3077 x = (x<<4) & ~0xF;
3078 if (c >= '0' && c <= '9')
3079 x += c - '0';
3080 else if (c >= 'a' && c <= 'f')
3081 x += 10 + c - 'a';
3082 else
3083 x += 10 + c - 'A';
3084 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003085#ifndef Py_UNICODE_WIDE
3086 if (x > 0x10000) {
3087 if (unicode_decode_call_errorhandler(
3088 errors, &errorHandler,
3089 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3090 starts, size, &startinpos, &endinpos, &exc, &s,
3091 (PyObject **)&v, &outpos, &p))
3092 goto onError;
3093 }
3094#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003095 *p++ = x;
3096 nextByte:
3097 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003099 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003100 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 Py_XDECREF(errorHandler);
3102 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003104
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 onError:
3106 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 Py_XDECREF(errorHandler);
3108 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003109 return NULL;
3110}
3111
3112PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003113 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114{
3115 PyObject *repr;
3116 char *p;
3117 char *q;
3118
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003119 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003121#ifdef Py_UNICODE_WIDE
3122 repr = PyString_FromStringAndSize(NULL, 10 * size);
3123#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003125#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 if (repr == NULL)
3127 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003128 if (size == 0)
3129 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130
3131 p = q = PyString_AS_STRING(repr);
3132 while (size-- > 0) {
3133 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003134#ifdef Py_UNICODE_WIDE
3135 /* Map 32-bit characters to '\Uxxxxxxxx' */
3136 if (ch >= 0x10000) {
3137 *p++ = '\\';
3138 *p++ = 'U';
3139 *p++ = hexdigit[(ch >> 28) & 0xf];
3140 *p++ = hexdigit[(ch >> 24) & 0xf];
3141 *p++ = hexdigit[(ch >> 20) & 0xf];
3142 *p++ = hexdigit[(ch >> 16) & 0xf];
3143 *p++ = hexdigit[(ch >> 12) & 0xf];
3144 *p++ = hexdigit[(ch >> 8) & 0xf];
3145 *p++ = hexdigit[(ch >> 4) & 0xf];
3146 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003147 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003148 else
3149#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 /* Map 16-bit characters to '\uxxxx' */
3151 if (ch >= 256) {
3152 *p++ = '\\';
3153 *p++ = 'u';
3154 *p++ = hexdigit[(ch >> 12) & 0xf];
3155 *p++ = hexdigit[(ch >> 8) & 0xf];
3156 *p++ = hexdigit[(ch >> 4) & 0xf];
3157 *p++ = hexdigit[ch & 15];
3158 }
3159 /* Copy everything else as-is */
3160 else
3161 *p++ = (char) ch;
3162 }
3163 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00003164 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 return repr;
3166}
3167
3168PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3169{
3170 if (!PyUnicode_Check(unicode)) {
3171 PyErr_BadArgument();
3172 return NULL;
3173 }
3174 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3175 PyUnicode_GET_SIZE(unicode));
3176}
3177
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003178/* --- Unicode Internal Codec ------------------------------------------- */
3179
3180PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003181 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003182 const char *errors)
3183{
3184 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003185 Py_ssize_t startinpos;
3186 Py_ssize_t endinpos;
3187 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003188 PyUnicodeObject *v;
3189 Py_UNICODE *p;
3190 const char *end;
3191 const char *reason;
3192 PyObject *errorHandler = NULL;
3193 PyObject *exc = NULL;
3194
Neal Norwitzd43069c2006-01-08 01:12:10 +00003195#ifdef Py_UNICODE_WIDE
3196 Py_UNICODE unimax = PyUnicode_GetMax();
3197#endif
3198
Armin Rigo7ccbca92006-10-04 12:17:45 +00003199 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003200 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3201 if (v == NULL)
3202 goto onError;
3203 if (PyUnicode_GetSize((PyObject *)v) == 0)
3204 return (PyObject *)v;
3205 p = PyUnicode_AS_UNICODE(v);
3206 end = s + size;
3207
3208 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003209 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003210 /* We have to sanity check the raw data, otherwise doom looms for
3211 some malformed UCS-4 data. */
3212 if (
3213 #ifdef Py_UNICODE_WIDE
3214 *p > unimax || *p < 0 ||
3215 #endif
3216 end-s < Py_UNICODE_SIZE
3217 )
3218 {
3219 startinpos = s - starts;
3220 if (end-s < Py_UNICODE_SIZE) {
3221 endinpos = end-starts;
3222 reason = "truncated input";
3223 }
3224 else {
3225 endinpos = s - starts + Py_UNICODE_SIZE;
3226 reason = "illegal code point (> 0x10FFFF)";
3227 }
3228 outpos = p - PyUnicode_AS_UNICODE(v);
3229 if (unicode_decode_call_errorhandler(
3230 errors, &errorHandler,
3231 "unicode_internal", reason,
3232 starts, size, &startinpos, &endinpos, &exc, &s,
3233 (PyObject **)&v, &outpos, &p)) {
3234 goto onError;
3235 }
3236 }
3237 else {
3238 p++;
3239 s += Py_UNICODE_SIZE;
3240 }
3241 }
3242
Martin v. Löwis412fb672006-04-13 06:34:32 +00003243 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003244 goto onError;
3245 Py_XDECREF(errorHandler);
3246 Py_XDECREF(exc);
3247 return (PyObject *)v;
3248
3249 onError:
3250 Py_XDECREF(v);
3251 Py_XDECREF(errorHandler);
3252 Py_XDECREF(exc);
3253 return NULL;
3254}
3255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256/* --- Latin-1 Codec ------------------------------------------------------ */
3257
3258PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003259 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 const char *errors)
3261{
3262 PyUnicodeObject *v;
3263 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003264
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003266 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003267 Py_UNICODE r = *(unsigned char*)s;
3268 return PyUnicode_FromUnicode(&r, 1);
3269 }
3270
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 v = _PyUnicode_New(size);
3272 if (v == NULL)
3273 goto onError;
3274 if (size == 0)
3275 return (PyObject *)v;
3276 p = PyUnicode_AS_UNICODE(v);
3277 while (size-- > 0)
3278 *p++ = (unsigned char)*s++;
3279 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003280
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 onError:
3282 Py_XDECREF(v);
3283 return NULL;
3284}
3285
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286/* create or adjust a UnicodeEncodeError */
3287static void make_encode_exception(PyObject **exceptionObject,
3288 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003289 const Py_UNICODE *unicode, Py_ssize_t size,
3290 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 if (*exceptionObject == NULL) {
3294 *exceptionObject = PyUnicodeEncodeError_Create(
3295 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 }
3297 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3299 goto onError;
3300 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3301 goto onError;
3302 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3303 goto onError;
3304 return;
3305 onError:
3306 Py_DECREF(*exceptionObject);
3307 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 }
3309}
3310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311/* raises a UnicodeEncodeError */
3312static void raise_encode_exception(PyObject **exceptionObject,
3313 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003314 const Py_UNICODE *unicode, Py_ssize_t size,
3315 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 const char *reason)
3317{
3318 make_encode_exception(exceptionObject,
3319 encoding, unicode, size, startpos, endpos, reason);
3320 if (*exceptionObject != NULL)
3321 PyCodec_StrictErrors(*exceptionObject);
3322}
3323
3324/* error handling callback helper:
3325 build arguments, call the callback and check the arguments,
3326 put the result into newpos and return the replacement string, which
3327 has to be freed by the caller */
3328static PyObject *unicode_encode_call_errorhandler(const char *errors,
3329 PyObject **errorHandler,
3330 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003331 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3332 Py_ssize_t startpos, Py_ssize_t endpos,
3333 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003334{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003335 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336
3337 PyObject *restuple;
3338 PyObject *resunicode;
3339
3340 if (*errorHandler == NULL) {
3341 *errorHandler = PyCodec_LookupError(errors);
3342 if (*errorHandler == NULL)
3343 return NULL;
3344 }
3345
3346 make_encode_exception(exceptionObject,
3347 encoding, unicode, size, startpos, endpos, reason);
3348 if (*exceptionObject == NULL)
3349 return NULL;
3350
3351 restuple = PyObject_CallFunctionObjArgs(
3352 *errorHandler, *exceptionObject, NULL);
3353 if (restuple == NULL)
3354 return NULL;
3355 if (!PyTuple_Check(restuple)) {
3356 PyErr_Format(PyExc_TypeError, &argparse[4]);
3357 Py_DECREF(restuple);
3358 return NULL;
3359 }
3360 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3361 &resunicode, newpos)) {
3362 Py_DECREF(restuple);
3363 return NULL;
3364 }
3365 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003366 *newpos = size+*newpos;
3367 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003368 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003369 Py_DECREF(restuple);
3370 return NULL;
3371 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 Py_INCREF(resunicode);
3373 Py_DECREF(restuple);
3374 return resunicode;
3375}
3376
3377static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003378 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 const char *errors,
3380 int limit)
3381{
3382 /* output object */
3383 PyObject *res;
3384 /* pointers to the beginning and end+1 of input */
3385 const Py_UNICODE *startp = p;
3386 const Py_UNICODE *endp = p + size;
3387 /* pointer to the beginning of the unencodable characters */
3388 /* const Py_UNICODE *badp = NULL; */
3389 /* pointer into the output */
3390 char *str;
3391 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003392 Py_ssize_t respos = 0;
3393 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003394 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3395 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 PyObject *errorHandler = NULL;
3397 PyObject *exc = NULL;
3398 /* the following variable is used for caching string comparisons
3399 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3400 int known_errorHandler = -1;
3401
3402 /* allocate enough for a simple encoding without
3403 replacements, if we need more, we'll resize */
3404 res = PyString_FromStringAndSize(NULL, size);
3405 if (res == NULL)
3406 goto onError;
3407 if (size == 0)
3408 return res;
3409 str = PyString_AS_STRING(res);
3410 ressize = size;
3411
3412 while (p<endp) {
3413 Py_UNICODE c = *p;
3414
3415 /* can we encode this? */
3416 if (c<limit) {
3417 /* no overflow check, because we know that the space is enough */
3418 *str++ = (char)c;
3419 ++p;
3420 }
3421 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 Py_ssize_t unicodepos = p-startp;
3423 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003425 Py_ssize_t repsize;
3426 Py_ssize_t newpos;
3427 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 Py_UNICODE *uni2;
3429 /* startpos for collecting unencodable chars */
3430 const Py_UNICODE *collstart = p;
3431 const Py_UNICODE *collend = p;
3432 /* find all unecodable characters */
3433 while ((collend < endp) && ((*collend)>=limit))
3434 ++collend;
3435 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3436 if (known_errorHandler==-1) {
3437 if ((errors==NULL) || (!strcmp(errors, "strict")))
3438 known_errorHandler = 1;
3439 else if (!strcmp(errors, "replace"))
3440 known_errorHandler = 2;
3441 else if (!strcmp(errors, "ignore"))
3442 known_errorHandler = 3;
3443 else if (!strcmp(errors, "xmlcharrefreplace"))
3444 known_errorHandler = 4;
3445 else
3446 known_errorHandler = 0;
3447 }
3448 switch (known_errorHandler) {
3449 case 1: /* strict */
3450 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3451 goto onError;
3452 case 2: /* replace */
3453 while (collstart++<collend)
3454 *str++ = '?'; /* fall through */
3455 case 3: /* ignore */
3456 p = collend;
3457 break;
3458 case 4: /* xmlcharrefreplace */
3459 respos = str-PyString_AS_STRING(res);
3460 /* determine replacement size (temporarily (mis)uses p) */
3461 for (p = collstart, repsize = 0; p < collend; ++p) {
3462 if (*p<10)
3463 repsize += 2+1+1;
3464 else if (*p<100)
3465 repsize += 2+2+1;
3466 else if (*p<1000)
3467 repsize += 2+3+1;
3468 else if (*p<10000)
3469 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003470#ifndef Py_UNICODE_WIDE
3471 else
3472 repsize += 2+5+1;
3473#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 else if (*p<100000)
3475 repsize += 2+5+1;
3476 else if (*p<1000000)
3477 repsize += 2+6+1;
3478 else
3479 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003480#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 }
3482 requiredsize = respos+repsize+(endp-collend);
3483 if (requiredsize > ressize) {
3484 if (requiredsize<2*ressize)
3485 requiredsize = 2*ressize;
3486 if (_PyString_Resize(&res, requiredsize))
3487 goto onError;
3488 str = PyString_AS_STRING(res) + respos;
3489 ressize = requiredsize;
3490 }
3491 /* generate replacement (temporarily (mis)uses p) */
3492 for (p = collstart; p < collend; ++p) {
3493 str += sprintf(str, "&#%d;", (int)*p);
3494 }
3495 p = collend;
3496 break;
3497 default:
3498 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3499 encoding, reason, startp, size, &exc,
3500 collstart-startp, collend-startp, &newpos);
3501 if (repunicode == NULL)
3502 goto onError;
3503 /* need more space? (at least enough for what we
3504 have+the replacement+the rest of the string, so
3505 we won't have to check space for encodable characters) */
3506 respos = str-PyString_AS_STRING(res);
3507 repsize = PyUnicode_GET_SIZE(repunicode);
3508 requiredsize = respos+repsize+(endp-collend);
3509 if (requiredsize > ressize) {
3510 if (requiredsize<2*ressize)
3511 requiredsize = 2*ressize;
3512 if (_PyString_Resize(&res, requiredsize)) {
3513 Py_DECREF(repunicode);
3514 goto onError;
3515 }
3516 str = PyString_AS_STRING(res) + respos;
3517 ressize = requiredsize;
3518 }
3519 /* check if there is anything unencodable in the replacement
3520 and copy it to the output */
3521 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3522 c = *uni2;
3523 if (c >= limit) {
3524 raise_encode_exception(&exc, encoding, startp, size,
3525 unicodepos, unicodepos+1, reason);
3526 Py_DECREF(repunicode);
3527 goto onError;
3528 }
3529 *str = (char)c;
3530 }
3531 p = startp + newpos;
3532 Py_DECREF(repunicode);
3533 }
3534 }
3535 }
3536 /* Resize if we allocated to much */
3537 respos = str-PyString_AS_STRING(res);
3538 if (respos<ressize)
3539 /* If this falls res will be NULL */
3540 _PyString_Resize(&res, respos);
3541 Py_XDECREF(errorHandler);
3542 Py_XDECREF(exc);
3543 return res;
3544
3545 onError:
3546 Py_XDECREF(res);
3547 Py_XDECREF(errorHandler);
3548 Py_XDECREF(exc);
3549 return NULL;
3550}
3551
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003553 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 const char *errors)
3555{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557}
3558
3559PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3560{
3561 if (!PyUnicode_Check(unicode)) {
3562 PyErr_BadArgument();
3563 return NULL;
3564 }
3565 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3566 PyUnicode_GET_SIZE(unicode),
3567 NULL);
3568}
3569
3570/* --- 7-bit ASCII Codec -------------------------------------------------- */
3571
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003573 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 const char *errors)
3575{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 PyUnicodeObject *v;
3578 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t startinpos;
3580 Py_ssize_t endinpos;
3581 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 const char *e;
3583 PyObject *errorHandler = NULL;
3584 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003585
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003587 if (size == 1 && *(unsigned char*)s < 128) {
3588 Py_UNICODE r = *(unsigned char*)s;
3589 return PyUnicode_FromUnicode(&r, 1);
3590 }
Tim Petersced69f82003-09-16 20:30:58 +00003591
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 v = _PyUnicode_New(size);
3593 if (v == NULL)
3594 goto onError;
3595 if (size == 0)
3596 return (PyObject *)v;
3597 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 e = s + size;
3599 while (s < e) {
3600 register unsigned char c = (unsigned char)*s;
3601 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 ++s;
3604 }
3605 else {
3606 startinpos = s-starts;
3607 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003608 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 if (unicode_decode_call_errorhandler(
3610 errors, &errorHandler,
3611 "ascii", "ordinal not in range(128)",
3612 starts, size, &startinpos, &endinpos, &exc, &s,
3613 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003617 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003618 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003619 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 Py_XDECREF(errorHandler);
3621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003623
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 onError:
3625 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003626 Py_XDECREF(errorHandler);
3627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 return NULL;
3629}
3630
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003632 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633 const char *errors)
3634{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636}
3637
3638PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3639{
3640 if (!PyUnicode_Check(unicode)) {
3641 PyErr_BadArgument();
3642 return NULL;
3643 }
3644 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3645 PyUnicode_GET_SIZE(unicode),
3646 NULL);
3647}
3648
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003649#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003650
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003651/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003652
Martin v. Löwisd8251432006-06-14 05:21:04 +00003653#if SIZEOF_INT < SIZEOF_SSIZE_T
3654#define NEED_RETRY
3655#endif
3656
3657/* XXX This code is limited to "true" double-byte encodings, as
3658 a) it assumes an incomplete character consists of a single byte, and
3659 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3660 encodings, see IsDBCSLeadByteEx documentation. */
3661
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  The byte itself must satisfy IsDBCSLeadByte(); it is
   then accepted if CharPrev() cannot step back from it, if the byte
   CharPrev() lands on is not itself a lead byte, or if CharPrev()
   stepped back by exactly two bytes. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3672
3673/*
3674 * Decode MBCS string into unicode object. If 'final' is set, converts
3675 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3676 */
3677static int decode_mbcs(PyUnicodeObject **v,
3678 const char *s, /* MBCS string */
3679 int size, /* sizeof MBCS string */
3680 int final)
3681{
3682 Py_UNICODE *p;
3683 Py_ssize_t n = 0;
3684 int usize = 0;
3685
3686 assert(size >= 0);
3687
3688 /* Skip trailing lead-byte unless 'final' is set */
3689 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3690 --size;
3691
3692 /* First get the size of the result */
3693 if (size > 0) {
3694 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3695 if (usize == 0) {
3696 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3697 return -1;
3698 }
3699 }
3700
3701 if (*v == NULL) {
3702 /* Create unicode object */
3703 *v = _PyUnicode_New(usize);
3704 if (*v == NULL)
3705 return -1;
3706 }
3707 else {
3708 /* Extend unicode object */
3709 n = PyUnicode_GET_SIZE(*v);
3710 if (_PyUnicode_Resize(v, n + usize) < 0)
3711 return -1;
3712 }
3713
3714 /* Do the conversion */
3715 if (size > 0) {
3716 p = PyUnicode_AS_UNICODE(*v) + n;
3717 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3718 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3719 return -1;
3720 }
3721 }
3722
3723 return size;
3724}
3725
3726PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3727 Py_ssize_t size,
3728 const char *errors,
3729 Py_ssize_t *consumed)
3730{
3731 PyUnicodeObject *v = NULL;
3732 int done;
3733
3734 if (consumed)
3735 *consumed = 0;
3736
3737#ifdef NEED_RETRY
3738 retry:
3739 if (size > INT_MAX)
3740 done = decode_mbcs(&v, s, INT_MAX, 0);
3741 else
3742#endif
3743 done = decode_mbcs(&v, s, (int)size, !consumed);
3744
3745 if (done < 0) {
3746 Py_XDECREF(v);
3747 return NULL;
3748 }
3749
3750 if (consumed)
3751 *consumed += done;
3752
3753#ifdef NEED_RETRY
3754 if (size > INT_MAX) {
3755 s += done;
3756 size -= done;
3757 goto retry;
3758 }
3759#endif
3760
3761 return (PyObject *)v;
3762}
3763
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003764PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003765 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003766 const char *errors)
3767{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003768 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3769}
3770
3771/*
3772 * Convert unicode into string object (MBCS).
3773 * Returns 0 if succeed, -1 otherwise.
3774 */
3775static int encode_mbcs(PyObject **repr,
3776 const Py_UNICODE *p, /* unicode */
3777 int size) /* size of unicode */
3778{
3779 int mbcssize = 0;
3780 Py_ssize_t n = 0;
3781
3782 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003783
3784 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003785 if (size > 0) {
3786 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3787 if (mbcssize == 0) {
3788 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3789 return -1;
3790 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003791 }
3792
Martin v. Löwisd8251432006-06-14 05:21:04 +00003793 if (*repr == NULL) {
3794 /* Create string object */
3795 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3796 if (*repr == NULL)
3797 return -1;
3798 }
3799 else {
3800 /* Extend string object */
3801 n = PyString_Size(*repr);
3802 if (_PyString_Resize(repr, n + mbcssize) < 0)
3803 return -1;
3804 }
3805
3806 /* Do the conversion */
3807 if (size > 0) {
3808 char *s = PyString_AS_STRING(*repr) + n;
3809 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3810 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3811 return -1;
3812 }
3813 }
3814
3815 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003816}
3817
3818PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003819 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003820 const char *errors)
3821{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003822 PyObject *repr = NULL;
3823 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003824
Martin v. Löwisd8251432006-06-14 05:21:04 +00003825#ifdef NEED_RETRY
3826 retry:
3827 if (size > INT_MAX)
3828 ret = encode_mbcs(&repr, p, INT_MAX);
3829 else
3830#endif
3831 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003832
Martin v. Löwisd8251432006-06-14 05:21:04 +00003833 if (ret < 0) {
3834 Py_XDECREF(repr);
3835 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003836 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003837
3838#ifdef NEED_RETRY
3839 if (size > INT_MAX) {
3840 p += INT_MAX;
3841 size -= INT_MAX;
3842 goto retry;
3843 }
3844#endif
3845
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003846 return repr;
3847}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003848
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003849PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3850{
3851 if (!PyUnicode_Check(unicode)) {
3852 PyErr_BadArgument();
3853 return NULL;
3854 }
3855 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3856 PyUnicode_GET_SIZE(unicode),
3857 NULL);
3858}
3859
Martin v. Löwisd8251432006-06-14 05:21:04 +00003860#undef NEED_RETRY
3861
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003862#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003863
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864/* --- Character Mapping Codec -------------------------------------------- */
3865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003867 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 PyObject *mapping,
3869 const char *errors)
3870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003872 Py_ssize_t startinpos;
3873 Py_ssize_t endinpos;
3874 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 PyUnicodeObject *v;
3877 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003878 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 PyObject *errorHandler = NULL;
3880 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003881 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003882 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 /* Default to Latin-1 */
3885 if (mapping == NULL)
3886 return PyUnicode_DecodeLatin1(s, size, errors);
3887
3888 v = _PyUnicode_New(size);
3889 if (v == NULL)
3890 goto onError;
3891 if (size == 0)
3892 return (PyObject *)v;
3893 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003894 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003895 if (PyUnicode_CheckExact(mapping)) {
3896 mapstring = PyUnicode_AS_UNICODE(mapping);
3897 maplen = PyUnicode_GET_SIZE(mapping);
3898 while (s < e) {
3899 unsigned char ch = *s;
3900 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003902 if (ch < maplen)
3903 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003905 if (x == 0xfffe) {
3906 /* undefined mapping */
3907 outpos = p-PyUnicode_AS_UNICODE(v);
3908 startinpos = s-starts;
3909 endinpos = startinpos+1;
3910 if (unicode_decode_call_errorhandler(
3911 errors, &errorHandler,
3912 "charmap", "character maps to <undefined>",
3913 starts, size, &startinpos, &endinpos, &exc, &s,
3914 (PyObject **)&v, &outpos, &p)) {
3915 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003916 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003917 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003918 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003919 *p++ = x;
3920 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003922 }
3923 else {
3924 while (s < e) {
3925 unsigned char ch = *s;
3926 PyObject *w, *x;
3927
3928 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3929 w = PyInt_FromLong((long)ch);
3930 if (w == NULL)
3931 goto onError;
3932 x = PyObject_GetItem(mapping, w);
3933 Py_DECREF(w);
3934 if (x == NULL) {
3935 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3936 /* No mapping found means: mapping is undefined. */
3937 PyErr_Clear();
3938 x = Py_None;
3939 Py_INCREF(x);
3940 } else
3941 goto onError;
3942 }
3943
3944 /* Apply mapping */
3945 if (PyInt_Check(x)) {
3946 long value = PyInt_AS_LONG(x);
3947 if (value < 0 || value > 65535) {
3948 PyErr_SetString(PyExc_TypeError,
3949 "character mapping must be in range(65536)");
3950 Py_DECREF(x);
3951 goto onError;
3952 }
3953 *p++ = (Py_UNICODE)value;
3954 }
3955 else if (x == Py_None) {
3956 /* undefined mapping */
3957 outpos = p-PyUnicode_AS_UNICODE(v);
3958 startinpos = s-starts;
3959 endinpos = startinpos+1;
3960 if (unicode_decode_call_errorhandler(
3961 errors, &errorHandler,
3962 "charmap", "character maps to <undefined>",
3963 starts, size, &startinpos, &endinpos, &exc, &s,
3964 (PyObject **)&v, &outpos, &p)) {
3965 Py_DECREF(x);
3966 goto onError;
3967 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003968 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003969 continue;
3970 }
3971 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003972 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003973
3974 if (targetsize == 1)
3975 /* 1-1 mapping */
3976 *p++ = *PyUnicode_AS_UNICODE(x);
3977
3978 else if (targetsize > 1) {
3979 /* 1-n mapping */
3980 if (targetsize > extrachars) {
3981 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003982 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3983 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003984 (targetsize << 2);
3985 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003986 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003987 if (_PyUnicode_Resize(&v,
3988 PyUnicode_GET_SIZE(v) + needed) < 0) {
3989 Py_DECREF(x);
3990 goto onError;
3991 }
3992 p = PyUnicode_AS_UNICODE(v) + oldpos;
3993 }
3994 Py_UNICODE_COPY(p,
3995 PyUnicode_AS_UNICODE(x),
3996 targetsize);
3997 p += targetsize;
3998 extrachars -= targetsize;
3999 }
4000 /* 1-0 mapping: skip the character */
4001 }
4002 else {
4003 /* wrong return value */
4004 PyErr_SetString(PyExc_TypeError,
4005 "character mapping must return integer, None or unicode");
4006 Py_DECREF(x);
4007 goto onError;
4008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004010 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 }
4013 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00004014 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016 Py_XDECREF(errorHandler);
4017 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004019
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 Py_XDECREF(errorHandler);
4022 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 Py_XDECREF(v);
4024 return NULL;
4025}
4026
Martin v. Löwis3f767792006-06-04 19:36:28 +00004027/* Charmap encoding: the lookup table */
4028
4029struct encoding_map{
4030 PyObject_HEAD
4031 unsigned char level1[32];
4032 int count2, count3;
4033 unsigned char level23[1];
4034};
4035
4036static PyObject*
4037encoding_map_size(PyObject *obj, PyObject* args)
4038{
4039 struct encoding_map *map = (struct encoding_map*)obj;
4040 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4041 128*map->count3);
4042}
4043
4044static PyMethodDef encoding_map_methods[] = {
4045 {"size", encoding_map_size, METH_NOARGS,
4046 PyDoc_STR("Return the size (in bytes) of this object") },
4047 { 0 }
4048};
4049
4050static void
4051encoding_map_dealloc(PyObject* o)
4052{
4053 PyObject_FREE(o);
4054}
4055
4056static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00004057 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004058 "EncodingMap", /*tp_name*/
4059 sizeof(struct encoding_map), /*tp_basicsize*/
4060 0, /*tp_itemsize*/
4061 /* methods */
4062 encoding_map_dealloc, /*tp_dealloc*/
4063 0, /*tp_print*/
4064 0, /*tp_getattr*/
4065 0, /*tp_setattr*/
4066 0, /*tp_compare*/
4067 0, /*tp_repr*/
4068 0, /*tp_as_number*/
4069 0, /*tp_as_sequence*/
4070 0, /*tp_as_mapping*/
4071 0, /*tp_hash*/
4072 0, /*tp_call*/
4073 0, /*tp_str*/
4074 0, /*tp_getattro*/
4075 0, /*tp_setattro*/
4076 0, /*tp_as_buffer*/
4077 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4078 0, /*tp_doc*/
4079 0, /*tp_traverse*/
4080 0, /*tp_clear*/
4081 0, /*tp_richcompare*/
4082 0, /*tp_weaklistoffset*/
4083 0, /*tp_iter*/
4084 0, /*tp_iternext*/
4085 encoding_map_methods, /*tp_methods*/
4086 0, /*tp_members*/
4087 0, /*tp_getset*/
4088 0, /*tp_base*/
4089 0, /*tp_dict*/
4090 0, /*tp_descr_get*/
4091 0, /*tp_descr_set*/
4092 0, /*tp_dictoffset*/
4093 0, /*tp_init*/
4094 0, /*tp_alloc*/
4095 0, /*tp_new*/
4096 0, /*tp_free*/
4097 0, /*tp_is_gc*/
4098};
4099
4100PyObject*
4101PyUnicode_BuildEncodingMap(PyObject* string)
4102{
4103 Py_UNICODE *decode;
4104 PyObject *result;
4105 struct encoding_map *mresult;
4106 int i;
4107 int need_dict = 0;
4108 unsigned char level1[32];
4109 unsigned char level2[512];
4110 unsigned char *mlevel1, *mlevel2, *mlevel3;
4111 int count2 = 0, count3 = 0;
4112
4113 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4114 PyErr_BadArgument();
4115 return NULL;
4116 }
4117 decode = PyUnicode_AS_UNICODE(string);
4118 memset(level1, 0xFF, sizeof level1);
4119 memset(level2, 0xFF, sizeof level2);
4120
4121 /* If there isn't a one-to-one mapping of NULL to \0,
4122 or if there are non-BMP characters, we need to use
4123 a mapping dictionary. */
4124 if (decode[0] != 0)
4125 need_dict = 1;
4126 for (i = 1; i < 256; i++) {
4127 int l1, l2;
4128 if (decode[i] == 0
4129 #ifdef Py_UNICODE_WIDE
4130 || decode[i] > 0xFFFF
4131 #endif
4132 ) {
4133 need_dict = 1;
4134 break;
4135 }
4136 if (decode[i] == 0xFFFE)
4137 /* unmapped character */
4138 continue;
4139 l1 = decode[i] >> 11;
4140 l2 = decode[i] >> 7;
4141 if (level1[l1] == 0xFF)
4142 level1[l1] = count2++;
4143 if (level2[l2] == 0xFF)
4144 level2[l2] = count3++;
4145 }
4146
4147 if (count2 >= 0xFF || count3 >= 0xFF)
4148 need_dict = 1;
4149
4150 if (need_dict) {
4151 PyObject *result = PyDict_New();
4152 PyObject *key, *value;
4153 if (!result)
4154 return NULL;
4155 for (i = 0; i < 256; i++) {
4156 key = value = NULL;
4157 key = PyInt_FromLong(decode[i]);
4158 value = PyInt_FromLong(i);
4159 if (!key || !value)
4160 goto failed1;
4161 if (PyDict_SetItem(result, key, value) == -1)
4162 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004163 Py_DECREF(key);
4164 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004165 }
4166 return result;
4167 failed1:
4168 Py_XDECREF(key);
4169 Py_XDECREF(value);
4170 Py_DECREF(result);
4171 return NULL;
4172 }
4173
4174 /* Create a three-level trie */
4175 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4176 16*count2 + 128*count3 - 1);
4177 if (!result)
4178 return PyErr_NoMemory();
4179 PyObject_Init(result, &EncodingMapType);
4180 mresult = (struct encoding_map*)result;
4181 mresult->count2 = count2;
4182 mresult->count3 = count3;
4183 mlevel1 = mresult->level1;
4184 mlevel2 = mresult->level23;
4185 mlevel3 = mresult->level23 + 16*count2;
4186 memcpy(mlevel1, level1, 32);
4187 memset(mlevel2, 0xFF, 16*count2);
4188 memset(mlevel3, 0, 128*count3);
4189 count3 = 0;
4190 for (i = 1; i < 256; i++) {
4191 int o1, o2, o3, i2, i3;
4192 if (decode[i] == 0xFFFE)
4193 /* unmapped character */
4194 continue;
4195 o1 = decode[i]>>11;
4196 o2 = (decode[i]>>7) & 0xF;
4197 i2 = 16*mlevel1[o1] + o2;
4198 if (mlevel2[i2] == 0xFF)
4199 mlevel2[i2] = count3++;
4200 o3 = decode[i] & 0x7F;
4201 i3 = 128*mlevel2[i2] + o3;
4202 mlevel3[i3] = i;
4203 }
4204 return result;
4205}
4206
4207static int
4208encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4209{
4210 struct encoding_map *map = (struct encoding_map*)mapping;
4211 int l1 = c>>11;
4212 int l2 = (c>>7) & 0xF;
4213 int l3 = c & 0x7F;
4214 int i;
4215
4216#ifdef Py_UNICODE_WIDE
4217 if (c > 0xFFFF) {
4218 return -1;
4219 }
4220#endif
4221 if (c == 0)
4222 return 0;
4223 /* level 1*/
4224 i = map->level1[l1];
4225 if (i == 0xFF) {
4226 return -1;
4227 }
4228 /* level 2*/
4229 i = map->level23[16*i+l2];
4230 if (i == 0xFF) {
4231 return -1;
4232 }
4233 /* level 3 */
4234 i = map->level23[16*map->count2 + 128*i + l3];
4235 if (i == 0) {
4236 return -1;
4237 }
4238 return i;
4239}
4240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241/* Lookup the character ch in the mapping. If the character
4242 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004243 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246 PyObject *w = PyInt_FromLong((long)c);
4247 PyObject *x;
4248
4249 if (w == NULL)
4250 return NULL;
4251 x = PyObject_GetItem(mapping, w);
4252 Py_DECREF(w);
4253 if (x == NULL) {
4254 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4255 /* No mapping found means: mapping is undefined. */
4256 PyErr_Clear();
4257 x = Py_None;
4258 Py_INCREF(x);
4259 return x;
4260 } else
4261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004263 else if (x == Py_None)
4264 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265 else if (PyInt_Check(x)) {
4266 long value = PyInt_AS_LONG(x);
4267 if (value < 0 || value > 255) {
4268 PyErr_SetString(PyExc_TypeError,
4269 "character mapping must be in range(256)");
4270 Py_DECREF(x);
4271 return NULL;
4272 }
4273 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 else if (PyString_Check(x))
4276 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 /* wrong return value */
4279 PyErr_SetString(PyExc_TypeError,
4280 "character mapping must return integer, None or str");
4281 Py_DECREF(x);
4282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 }
4284}
4285
Martin v. Löwis3f767792006-06-04 19:36:28 +00004286static int
4287charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4288{
4289 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4290 /* exponentially overallocate to minimize reallocations */
4291 if (requiredsize < 2*outsize)
4292 requiredsize = 2*outsize;
4293 if (_PyString_Resize(outobj, requiredsize)) {
4294 return 0;
4295 }
4296 return 1;
4297}
4298
/* Outcome of trying to encode one character through a character map */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302/* lookup the character, put the result in the output string and adjust
4303 various state variables. Reallocate the output string if not enough
4304 space is available. Return a new reference to the object that
4305 was put in the output buffer, or Py_None, if the mapping was undefined
4306 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004307 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004309charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004310 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004312 PyObject *rep;
4313 char *outstart;
4314 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315
Christian Heimese93237d2007-12-19 02:37:44 +00004316 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004317 int res = encoding_map_lookup(c, mapping);
4318 Py_ssize_t requiredsize = *outpos+1;
4319 if (res == -1)
4320 return enc_FAILED;
4321 if (outsize<requiredsize)
4322 if (!charmapencode_resize(outobj, outpos, requiredsize))
4323 return enc_EXCEPTION;
4324 outstart = PyString_AS_STRING(*outobj);
4325 outstart[(*outpos)++] = (char)res;
4326 return enc_SUCCESS;
4327 }
4328
4329 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00004331 return enc_EXCEPTION;
4332 else if (rep==Py_None) {
4333 Py_DECREF(rep);
4334 return enc_FAILED;
4335 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004337 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004338 if (outsize<requiredsize)
4339 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004341 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004343 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4345 }
4346 else {
4347 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004348 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4349 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004350 if (outsize<requiredsize)
4351 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004353 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004355 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 memcpy(outstart + *outpos, repchars, repsize);
4357 *outpos += repsize;
4358 }
4359 }
Georg Brandl9f167602006-06-04 21:46:16 +00004360 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004361 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004362}
4363
4364/* handle an error in PyUnicode_EncodeCharmap
4365 Return 0 on success, -1 on error */
4366static
4367int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004368 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004370 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004371 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372{
4373 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004374 Py_ssize_t repsize;
4375 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 Py_UNICODE *uni2;
4377 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004378 Py_ssize_t collstartpos = *inpos;
4379 Py_ssize_t collendpos = *inpos+1;
4380 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 char *encoding = "charmap";
4382 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004383 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 /* find all unencodable characters */
4386 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004387 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004388 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004389 int res = encoding_map_lookup(p[collendpos], mapping);
4390 if (res != -1)
4391 break;
4392 ++collendpos;
4393 continue;
4394 }
4395
4396 rep = charmapencode_lookup(p[collendpos], mapping);
4397 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 else if (rep!=Py_None) {
4400 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 break;
4402 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004403 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 ++collendpos;
4405 }
4406 /* cache callback name lookup
4407 * (if not done yet, i.e. it's the first error) */
4408 if (*known_errorHandler==-1) {
4409 if ((errors==NULL) || (!strcmp(errors, "strict")))
4410 *known_errorHandler = 1;
4411 else if (!strcmp(errors, "replace"))
4412 *known_errorHandler = 2;
4413 else if (!strcmp(errors, "ignore"))
4414 *known_errorHandler = 3;
4415 else if (!strcmp(errors, "xmlcharrefreplace"))
4416 *known_errorHandler = 4;
4417 else
4418 *known_errorHandler = 0;
4419 }
4420 switch (*known_errorHandler) {
4421 case 1: /* strict */
4422 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4423 return -1;
4424 case 2: /* replace */
4425 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4426 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004427 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 return -1;
4429 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004430 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4432 return -1;
4433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 }
4435 /* fall through */
4436 case 3: /* ignore */
4437 *inpos = collendpos;
4438 break;
4439 case 4: /* xmlcharrefreplace */
4440 /* generate replacement (temporarily (mis)uses p) */
4441 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4442 char buffer[2+29+1+1];
4443 char *cp;
4444 sprintf(buffer, "&#%d;", (int)p[collpos]);
4445 for (cp = buffer; *cp; ++cp) {
4446 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004447 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004449 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4451 return -1;
4452 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 }
4454 }
4455 *inpos = collendpos;
4456 break;
4457 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004458 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 encoding, reason, p, size, exceptionObject,
4460 collstartpos, collendpos, &newpos);
4461 if (repunicode == NULL)
4462 return -1;
4463 /* generate replacement */
4464 repsize = PyUnicode_GET_SIZE(repunicode);
4465 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4466 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004467 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 return -1;
4469 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004470 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4473 return -1;
4474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 }
4476 *inpos = newpos;
4477 Py_DECREF(repunicode);
4478 }
4479 return 0;
4480}
4481
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004483 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 PyObject *mapping,
4485 const char *errors)
4486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 /* output object */
4488 PyObject *res = NULL;
4489 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004490 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004492 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 PyObject *errorHandler = NULL;
4494 PyObject *exc = NULL;
4495 /* the following variable is used for caching string comparisons
4496 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4497 * 3=ignore, 4=xmlcharrefreplace */
4498 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499
4500 /* Default to Latin-1 */
4501 if (mapping == NULL)
4502 return PyUnicode_EncodeLatin1(p, size, errors);
4503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 /* allocate enough for a simple encoding without
4505 replacements, if we need more, we'll resize */
4506 res = PyString_FromStringAndSize(NULL, size);
4507 if (res == NULL)
4508 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004509 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 while (inpos<size) {
4513 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00004514 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4515 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004517 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 if (charmap_encoding_error(p, size, &inpos, mapping,
4519 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004520 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00004521 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004522 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 else
4526 /* done with this character => adjust input position */
4527 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 /* Resize if we allocated to much */
4531 if (respos<PyString_GET_SIZE(res)) {
4532 if (_PyString_Resize(&res, respos))
4533 goto onError;
4534 }
4535 Py_XDECREF(exc);
4536 Py_XDECREF(errorHandler);
4537 return res;
4538
4539 onError:
4540 Py_XDECREF(res);
4541 Py_XDECREF(exc);
4542 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 return NULL;
4544}
4545
4546PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4547 PyObject *mapping)
4548{
4549 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4550 PyErr_BadArgument();
4551 return NULL;
4552 }
4553 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4554 PyUnicode_GET_SIZE(unicode),
4555 mapping,
4556 NULL);
4557}
4558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559/* create or adjust a UnicodeTranslateError */
4560static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004561 const Py_UNICODE *unicode, Py_ssize_t size,
4562 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 if (*exceptionObject == NULL) {
4566 *exceptionObject = PyUnicodeTranslateError_Create(
4567 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568 }
4569 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4571 goto onError;
4572 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4573 goto onError;
4574 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4575 goto onError;
4576 return;
4577 onError:
4578 Py_DECREF(*exceptionObject);
4579 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580 }
4581}
4582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583/* raises a UnicodeTranslateError */
4584static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 const Py_UNICODE *unicode, Py_ssize_t size,
4586 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 const char *reason)
4588{
4589 make_translate_exception(exceptionObject,
4590 unicode, size, startpos, endpos, reason);
4591 if (*exceptionObject != NULL)
4592 PyCodec_StrictErrors(*exceptionObject);
4593}
4594
4595/* error handling callback helper:
4596 build arguments, call the callback and check the arguments,
4597 put the result into newpos and return the replacement string, which
4598 has to be freed by the caller */
4599static PyObject *unicode_translate_call_errorhandler(const char *errors,
4600 PyObject **errorHandler,
4601 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004602 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4603 Py_ssize_t startpos, Py_ssize_t endpos,
4604 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004606 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607
Martin v. Löwis412fb672006-04-13 06:34:32 +00004608 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 PyObject *restuple;
4610 PyObject *resunicode;
4611
4612 if (*errorHandler == NULL) {
4613 *errorHandler = PyCodec_LookupError(errors);
4614 if (*errorHandler == NULL)
4615 return NULL;
4616 }
4617
4618 make_translate_exception(exceptionObject,
4619 unicode, size, startpos, endpos, reason);
4620 if (*exceptionObject == NULL)
4621 return NULL;
4622
4623 restuple = PyObject_CallFunctionObjArgs(
4624 *errorHandler, *exceptionObject, NULL);
4625 if (restuple == NULL)
4626 return NULL;
4627 if (!PyTuple_Check(restuple)) {
4628 PyErr_Format(PyExc_TypeError, &argparse[4]);
4629 Py_DECREF(restuple);
4630 return NULL;
4631 }
4632 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004633 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 Py_DECREF(restuple);
4635 return NULL;
4636 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004637 if (i_newpos<0)
4638 *newpos = size+i_newpos;
4639 else
4640 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004641 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004642 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004643 Py_DECREF(restuple);
4644 return NULL;
4645 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 Py_INCREF(resunicode);
4647 Py_DECREF(restuple);
4648 return resunicode;
4649}
4650
4651/* Lookup the character ch in the mapping and put the result in result,
4652 which must be decrefed by the caller.
4653 Return 0 on success, -1 on error */
4654static
4655int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4656{
4657 PyObject *w = PyInt_FromLong((long)c);
4658 PyObject *x;
4659
4660 if (w == NULL)
4661 return -1;
4662 x = PyObject_GetItem(mapping, w);
4663 Py_DECREF(w);
4664 if (x == NULL) {
4665 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4666 /* No mapping found means: use 1:1 mapping. */
4667 PyErr_Clear();
4668 *result = NULL;
4669 return 0;
4670 } else
4671 return -1;
4672 }
4673 else if (x == Py_None) {
4674 *result = x;
4675 return 0;
4676 }
4677 else if (PyInt_Check(x)) {
4678 long value = PyInt_AS_LONG(x);
4679 long max = PyUnicode_GetMax();
4680 if (value < 0 || value > max) {
4681 PyErr_Format(PyExc_TypeError,
4682 "character mapping must be in range(0x%lx)", max+1);
4683 Py_DECREF(x);
4684 return -1;
4685 }
4686 *result = x;
4687 return 0;
4688 }
4689 else if (PyUnicode_Check(x)) {
4690 *result = x;
4691 return 0;
4692 }
4693 else {
4694 /* wrong return value */
4695 PyErr_SetString(PyExc_TypeError,
4696 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004697 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 return -1;
4699 }
4700}
4701/* ensure that *outobj is at least requiredsize characters long,
4702if not reallocate and adjust various state variables.
4703Return 0 on success, -1 on error */
4704static
Walter Dörwald4894c302003-10-24 14:25:28 +00004705int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004706 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004707{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004708 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004709 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004711 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004713 if (requiredsize < 2 * oldsize)
4714 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004715 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 return -1;
4717 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 }
4719 return 0;
4720}
4721/* lookup the character, put the result in the output string and adjust
4722 various state variables. Return a new reference to the object that
4723 was put in the output buffer in *result, or Py_None, if the mapping was
4724 undefined (in which case no character was written).
4725 The called must decref result.
4726 Return 0 on success, -1 on error. */
4727static
Walter Dörwald4894c302003-10-24 14:25:28 +00004728int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004729 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004730 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004731{
Walter Dörwald4894c302003-10-24 14:25:28 +00004732 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 return -1;
4734 if (*res==NULL) {
4735 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004736 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 }
4738 else if (*res==Py_None)
4739 ;
4740 else if (PyInt_Check(*res)) {
4741 /* no overflow check, because we know that the space is enough */
4742 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4743 }
4744 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004745 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004746 if (repsize==1) {
4747 /* no overflow check, because we know that the space is enough */
4748 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4749 }
4750 else if (repsize!=0) {
4751 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004752 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004753 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004754 repsize - 1;
4755 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 return -1;
4757 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4758 *outp += repsize;
4759 }
4760 }
4761 else
4762 return -1;
4763 return 0;
4764}
4765
4766PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004767 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 PyObject *mapping,
4769 const char *errors)
4770{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 /* output object */
4772 PyObject *res = NULL;
4773 /* pointers to the beginning and end+1 of input */
4774 const Py_UNICODE *startp = p;
4775 const Py_UNICODE *endp = p + size;
4776 /* pointer into the output */
4777 Py_UNICODE *str;
4778 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004779 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 char *reason = "character maps to <undefined>";
4781 PyObject *errorHandler = NULL;
4782 PyObject *exc = NULL;
4783 /* the following variable is used for caching string comparisons
4784 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4785 * 3=ignore, 4=xmlcharrefreplace */
4786 int known_errorHandler = -1;
4787
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 if (mapping == NULL) {
4789 PyErr_BadArgument();
4790 return NULL;
4791 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792
4793 /* allocate enough for a simple 1:1 translation without
4794 replacements, if we need more, we'll resize */
4795 res = PyUnicode_FromUnicode(NULL, size);
4796 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004797 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 return res;
4800 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 while (p<endp) {
4803 /* try to encode it */
4804 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004805 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 goto onError;
4808 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004809 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810 if (x!=Py_None) /* it worked => adjust input pointer */
4811 ++p;
4812 else { /* untranslatable character */
4813 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004814 Py_ssize_t repsize;
4815 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 Py_UNICODE *uni2;
4817 /* startpos for collecting untranslatable chars */
4818 const Py_UNICODE *collstart = p;
4819 const Py_UNICODE *collend = p+1;
4820 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 /* find all untranslatable characters */
4823 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004824 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 goto onError;
4826 Py_XDECREF(x);
4827 if (x!=Py_None)
4828 break;
4829 ++collend;
4830 }
4831 /* cache callback name lookup
4832 * (if not done yet, i.e. it's the first error) */
4833 if (known_errorHandler==-1) {
4834 if ((errors==NULL) || (!strcmp(errors, "strict")))
4835 known_errorHandler = 1;
4836 else if (!strcmp(errors, "replace"))
4837 known_errorHandler = 2;
4838 else if (!strcmp(errors, "ignore"))
4839 known_errorHandler = 3;
4840 else if (!strcmp(errors, "xmlcharrefreplace"))
4841 known_errorHandler = 4;
4842 else
4843 known_errorHandler = 0;
4844 }
4845 switch (known_errorHandler) {
4846 case 1: /* strict */
4847 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4848 goto onError;
4849 case 2: /* replace */
4850 /* No need to check for space, this is a 1:1 replacement */
4851 for (coll = collstart; coll<collend; ++coll)
4852 *str++ = '?';
4853 /* fall through */
4854 case 3: /* ignore */
4855 p = collend;
4856 break;
4857 case 4: /* xmlcharrefreplace */
4858 /* generate replacement (temporarily (mis)uses p) */
4859 for (p = collstart; p < collend; ++p) {
4860 char buffer[2+29+1+1];
4861 char *cp;
4862 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004863 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4865 goto onError;
4866 for (cp = buffer; *cp; ++cp)
4867 *str++ = *cp;
4868 }
4869 p = collend;
4870 break;
4871 default:
4872 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4873 reason, startp, size, &exc,
4874 collstart-startp, collend-startp, &newpos);
4875 if (repunicode == NULL)
4876 goto onError;
4877 /* generate replacement */
4878 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004879 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4881 Py_DECREF(repunicode);
4882 goto onError;
4883 }
4884 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4885 *str++ = *uni2;
4886 p = startp + newpos;
4887 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 }
4889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 /* Resize if we allocated to much */
4892 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004893 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004894 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004895 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 }
4897 Py_XDECREF(exc);
4898 Py_XDECREF(errorHandler);
4899 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 onError:
4902 Py_XDECREF(res);
4903 Py_XDECREF(exc);
4904 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 return NULL;
4906}
4907
4908PyObject *PyUnicode_Translate(PyObject *str,
4909 PyObject *mapping,
4910 const char *errors)
4911{
4912 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004913
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 str = PyUnicode_FromObject(str);
4915 if (str == NULL)
4916 goto onError;
4917 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4918 PyUnicode_GET_SIZE(str),
4919 mapping,
4920 errors);
4921 Py_DECREF(str);
4922 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004923
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 onError:
4925 Py_XDECREF(str);
4926 return NULL;
4927}
Tim Petersced69f82003-09-16 20:30:58 +00004928
Guido van Rossum9e896b32000-04-05 20:11:21 +00004929/* --- Decimal Encoder ---------------------------------------------------- */
4930
4931int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004932 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004933 char *output,
4934 const char *errors)
4935{
4936 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 PyObject *errorHandler = NULL;
4938 PyObject *exc = NULL;
4939 const char *encoding = "decimal";
4940 const char *reason = "invalid decimal Unicode string";
4941 /* the following variable is used for caching string comparisons
4942 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4943 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004944
4945 if (output == NULL) {
4946 PyErr_BadArgument();
4947 return -1;
4948 }
4949
4950 p = s;
4951 end = s + length;
4952 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004954 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004956 Py_ssize_t repsize;
4957 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 Py_UNICODE *uni2;
4959 Py_UNICODE *collstart;
4960 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004961
Guido van Rossum9e896b32000-04-05 20:11:21 +00004962 if (Py_UNICODE_ISSPACE(ch)) {
4963 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004965 continue;
4966 }
4967 decimal = Py_UNICODE_TODECIMAL(ch);
4968 if (decimal >= 0) {
4969 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004971 continue;
4972 }
Guido van Rossumba477042000-04-06 18:18:10 +00004973 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004974 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004976 continue;
4977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 /* All other characters are considered unencodable */
4979 collstart = p;
4980 collend = p+1;
4981 while (collend < end) {
4982 if ((0 < *collend && *collend < 256) ||
4983 !Py_UNICODE_ISSPACE(*collend) ||
4984 Py_UNICODE_TODECIMAL(*collend))
4985 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004986 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 /* cache callback name lookup
4988 * (if not done yet, i.e. it's the first error) */
4989 if (known_errorHandler==-1) {
4990 if ((errors==NULL) || (!strcmp(errors, "strict")))
4991 known_errorHandler = 1;
4992 else if (!strcmp(errors, "replace"))
4993 known_errorHandler = 2;
4994 else if (!strcmp(errors, "ignore"))
4995 known_errorHandler = 3;
4996 else if (!strcmp(errors, "xmlcharrefreplace"))
4997 known_errorHandler = 4;
4998 else
4999 known_errorHandler = 0;
5000 }
5001 switch (known_errorHandler) {
5002 case 1: /* strict */
5003 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5004 goto onError;
5005 case 2: /* replace */
5006 for (p = collstart; p < collend; ++p)
5007 *output++ = '?';
5008 /* fall through */
5009 case 3: /* ignore */
5010 p = collend;
5011 break;
5012 case 4: /* xmlcharrefreplace */
5013 /* generate replacement (temporarily (mis)uses p) */
5014 for (p = collstart; p < collend; ++p)
5015 output += sprintf(output, "&#%d;", (int)*p);
5016 p = collend;
5017 break;
5018 default:
5019 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5020 encoding, reason, s, length, &exc,
5021 collstart-s, collend-s, &newpos);
5022 if (repunicode == NULL)
5023 goto onError;
5024 /* generate replacement */
5025 repsize = PyUnicode_GET_SIZE(repunicode);
5026 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5027 Py_UNICODE ch = *uni2;
5028 if (Py_UNICODE_ISSPACE(ch))
5029 *output++ = ' ';
5030 else {
5031 decimal = Py_UNICODE_TODECIMAL(ch);
5032 if (decimal >= 0)
5033 *output++ = '0' + decimal;
5034 else if (0 < ch && ch < 256)
5035 *output++ = (char)ch;
5036 else {
5037 Py_DECREF(repunicode);
5038 raise_encode_exception(&exc, encoding,
5039 s, length, collstart-s, collend-s, reason);
5040 goto onError;
5041 }
5042 }
5043 }
5044 p = s + newpos;
5045 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005046 }
5047 }
5048 /* 0-terminate the output string */
5049 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005050 Py_XDECREF(exc);
5051 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005052 return 0;
5053
5054 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055 Py_XDECREF(exc);
5056 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005057 return -1;
5058}
5059
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060/* --- Helpers ------------------------------------------------------------ */
5061
Fredrik Lundha50d2012006-05-26 17:04:58 +00005062#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005063
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005064#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005065#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005066#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005067
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005068Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00005069STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
5070{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00005071 if (str[0] != other[0])
5072 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00005073 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
5074}
5075
Fredrik Lundhb9479482006-05-26 17:22:38 +00005076#define STRINGLIB_EMPTY unicode_empty
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005077#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005078
Fredrik Lundha50d2012006-05-26 17:04:58 +00005079#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005080
5081#include "stringlib/count.h"
5082#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005083#include "stringlib/partition.h"
5084
/* Helper macro to fix up start/end slice values.  Operates on the
   caller's local variables named `start` and `end`: a negative value is
   first offset by (obj)->length and then floored at 0; `end` is capped
   at (obj)->length before that adjustment.  Expands to several
   statements and evaluates (obj) more than once, so use it only as a
   statement of its own inside a braced block. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5097
Martin v. Löwis18e16552006-02-15 17:27:45 +00005098Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005099 PyObject *substr,
5100 Py_ssize_t start,
5101 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005103 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005104 PyUnicodeObject* str_obj;
5105 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005106
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005107 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5108 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005110 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5111 if (!sub_obj) {
5112 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 return -1;
5114 }
Tim Petersced69f82003-09-16 20:30:58 +00005115
Fredrik Lundhc8162812006-05-26 19:33:03 +00005116 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005117
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005118 result = stringlib_count(
5119 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5120 );
5121
5122 Py_DECREF(sub_obj);
5123 Py_DECREF(str_obj);
5124
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 return result;
5126}
5127
Martin v. Löwis18e16552006-02-15 17:27:45 +00005128Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005129 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005130 Py_ssize_t start,
5131 Py_ssize_t end,
5132 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005134 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005135
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005136 str = PyUnicode_FromObject(str);
5137 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005138 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005139 sub = PyUnicode_FromObject(sub);
5140 if (!sub) {
5141 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005142 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 }
Tim Petersced69f82003-09-16 20:30:58 +00005144
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005145 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005146 result = stringlib_find_slice(
5147 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5148 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5149 start, end
5150 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005151 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005152 result = stringlib_rfind_slice(
5153 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5154 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5155 start, end
5156 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005157
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005158 Py_DECREF(str);
5159 Py_DECREF(sub);
5160
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 return result;
5162}
5163
Tim Petersced69f82003-09-16 20:30:58 +00005164static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165int tailmatch(PyUnicodeObject *self,
5166 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005167 Py_ssize_t start,
5168 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 int direction)
5170{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 if (substring->length == 0)
5172 return 1;
5173
Fredrik Lundhc8162812006-05-26 19:33:03 +00005174 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175
5176 end -= substring->length;
5177 if (end < start)
5178 return 0;
5179
5180 if (direction > 0) {
5181 if (Py_UNICODE_MATCH(self, end, substring))
5182 return 1;
5183 } else {
5184 if (Py_UNICODE_MATCH(self, start, substring))
5185 return 1;
5186 }
5187
5188 return 0;
5189}
5190
Martin v. Löwis18e16552006-02-15 17:27:45 +00005191Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 Py_ssize_t start,
5194 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 int direction)
5196{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005197 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 str = PyUnicode_FromObject(str);
5200 if (str == NULL)
5201 return -1;
5202 substr = PyUnicode_FromObject(substr);
5203 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005204 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 return -1;
5206 }
Tim Petersced69f82003-09-16 20:30:58 +00005207
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 result = tailmatch((PyUnicodeObject *)str,
5209 (PyUnicodeObject *)substr,
5210 start, end, direction);
5211 Py_DECREF(str);
5212 Py_DECREF(substr);
5213 return result;
5214}
5215
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216/* Apply fixfct filter to the Unicode object self and return a
5217 reference to the modified object */
5218
Tim Petersced69f82003-09-16 20:30:58 +00005219static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220PyObject *fixup(PyUnicodeObject *self,
5221 int (*fixfct)(PyUnicodeObject *s))
5222{
5223
5224 PyUnicodeObject *u;
5225
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005226 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 if (u == NULL)
5228 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005229
5230 Py_UNICODE_COPY(u->str, self->str, self->length);
5231
Tim Peters7a29bd52001-09-12 03:03:31 +00005232 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 /* fixfct should return TRUE if it modified the buffer. If
5234 FALSE, return a reference to the original buffer instead
5235 (to save space, not time) */
5236 Py_INCREF(self);
5237 Py_DECREF(u);
5238 return (PyObject*) self;
5239 }
5240 return (PyObject*) u;
5241}
5242
Tim Petersced69f82003-09-16 20:30:58 +00005243static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244int fixupper(PyUnicodeObject *self)
5245{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005246 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 Py_UNICODE *s = self->str;
5248 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 while (len-- > 0) {
5251 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005252
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 ch = Py_UNICODE_TOUPPER(*s);
5254 if (ch != *s) {
5255 status = 1;
5256 *s = ch;
5257 }
5258 s++;
5259 }
5260
5261 return status;
5262}
5263
Tim Petersced69f82003-09-16 20:30:58 +00005264static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265int fixlower(PyUnicodeObject *self)
5266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 Py_UNICODE *s = self->str;
5269 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 while (len-- > 0) {
5272 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 ch = Py_UNICODE_TOLOWER(*s);
5275 if (ch != *s) {
5276 status = 1;
5277 *s = ch;
5278 }
5279 s++;
5280 }
5281
5282 return status;
5283}
5284
Tim Petersced69f82003-09-16 20:30:58 +00005285static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286int fixswapcase(PyUnicodeObject *self)
5287{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005288 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 Py_UNICODE *s = self->str;
5290 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 while (len-- > 0) {
5293 if (Py_UNICODE_ISUPPER(*s)) {
5294 *s = Py_UNICODE_TOLOWER(*s);
5295 status = 1;
5296 } else if (Py_UNICODE_ISLOWER(*s)) {
5297 *s = Py_UNICODE_TOUPPER(*s);
5298 status = 1;
5299 }
5300 s++;
5301 }
5302
5303 return status;
5304}
5305
Tim Petersced69f82003-09-16 20:30:58 +00005306static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307int fixcapitalize(PyUnicodeObject *self)
5308{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005309 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005310 Py_UNICODE *s = self->str;
5311 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005312
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005313 if (len == 0)
5314 return 0;
5315 if (Py_UNICODE_ISLOWER(*s)) {
5316 *s = Py_UNICODE_TOUPPER(*s);
5317 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005319 s++;
5320 while (--len > 0) {
5321 if (Py_UNICODE_ISUPPER(*s)) {
5322 *s = Py_UNICODE_TOLOWER(*s);
5323 status = 1;
5324 }
5325 s++;
5326 }
5327 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328}
5329
5330static
5331int fixtitle(PyUnicodeObject *self)
5332{
5333 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5334 register Py_UNICODE *e;
5335 int previous_is_cased;
5336
5337 /* Shortcut for single character strings */
5338 if (PyUnicode_GET_SIZE(self) == 1) {
5339 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5340 if (*p != ch) {
5341 *p = ch;
5342 return 1;
5343 }
5344 else
5345 return 0;
5346 }
Tim Petersced69f82003-09-16 20:30:58 +00005347
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 e = p + PyUnicode_GET_SIZE(self);
5349 previous_is_cased = 0;
5350 for (; p < e; p++) {
5351 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005352
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 if (previous_is_cased)
5354 *p = Py_UNICODE_TOLOWER(ch);
5355 else
5356 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005357
5358 if (Py_UNICODE_ISLOWER(ch) ||
5359 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 Py_UNICODE_ISTITLE(ch))
5361 previous_is_cased = 1;
5362 else
5363 previous_is_cased = 0;
5364 }
5365 return 1;
5366}
5367
Tim Peters8ce9f162004-08-27 01:49:32 +00005368PyObject *
5369PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Tim Peters8ce9f162004-08-27 01:49:32 +00005371 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005372 const Py_UNICODE blank = ' ';
5373 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005374 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005375 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005376 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5377 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005378 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5379 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005380 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005381 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005382 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Tim Peters05eba1f2004-08-27 21:32:02 +00005384 fseq = PySequence_Fast(seq, "");
5385 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005386 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005387 }
5388
Tim Peters91879ab2004-08-27 22:35:44 +00005389 /* Grrrr. A codec may be invoked to convert str objects to
5390 * Unicode, and so it's possible to call back into Python code
5391 * during PyUnicode_FromObject(), and so it's possible for a sick
5392 * codec to change the size of fseq (if seq is a list). Therefore
5393 * we have to keep refetching the size -- can't assume seqlen
5394 * is invariant.
5395 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005396 seqlen = PySequence_Fast_GET_SIZE(fseq);
5397 /* If empty sequence, return u"". */
5398 if (seqlen == 0) {
5399 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5400 goto Done;
5401 }
5402 /* If singleton sequence with an exact Unicode, return that. */
5403 if (seqlen == 1) {
5404 item = PySequence_Fast_GET_ITEM(fseq, 0);
5405 if (PyUnicode_CheckExact(item)) {
5406 Py_INCREF(item);
5407 res = (PyUnicodeObject *)item;
5408 goto Done;
5409 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005410 }
5411
Tim Peters05eba1f2004-08-27 21:32:02 +00005412 /* At least two items to join, or one that isn't exact Unicode. */
5413 if (seqlen > 1) {
5414 /* Set up sep and seplen -- they're needed. */
5415 if (separator == NULL) {
5416 sep = &blank;
5417 seplen = 1;
5418 }
5419 else {
5420 internal_separator = PyUnicode_FromObject(separator);
5421 if (internal_separator == NULL)
5422 goto onError;
5423 sep = PyUnicode_AS_UNICODE(internal_separator);
5424 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005425 /* In case PyUnicode_FromObject() mutated seq. */
5426 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005427 }
5428 }
5429
5430 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005431 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005432 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005433 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005434 res_p = PyUnicode_AS_UNICODE(res);
5435 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005436
Tim Peters05eba1f2004-08-27 21:32:02 +00005437 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00005438 Py_ssize_t itemlen;
5439 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005440
5441 item = PySequence_Fast_GET_ITEM(fseq, i);
5442 /* Convert item to Unicode. */
5443 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5444 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00005445 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005446 " %.80s found",
Christian Heimese93237d2007-12-19 02:37:44 +00005447 i, Py_TYPE(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005448 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005449 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005450 item = PyUnicode_FromObject(item);
5451 if (item == NULL)
5452 goto onError;
5453 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005454
Tim Peters91879ab2004-08-27 22:35:44 +00005455 /* In case PyUnicode_FromObject() mutated seq. */
5456 seqlen = PySequence_Fast_GET_SIZE(fseq);
5457
Tim Peters8ce9f162004-08-27 01:49:32 +00005458 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005460 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005461 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005462 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005463 if (i < seqlen - 1) {
5464 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00005465 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005466 goto Overflow;
5467 }
5468 if (new_res_used > res_alloc) {
5469 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005470 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005471 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00005472 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005473 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005474 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00005475 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005476 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005478 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005479 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005481
5482 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005483 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005484 res_p += itemlen;
5485 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00005486 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005487 res_p += seplen;
5488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005490 res_used = new_res_used;
5491 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005492
Tim Peters05eba1f2004-08-27 21:32:02 +00005493 /* Shrink res to match the used area; this probably can't fail,
5494 * but it's cheap to check.
5495 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005496 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005497 goto onError;
5498
5499 Done:
5500 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005501 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 return (PyObject *)res;
5503
Tim Peters8ce9f162004-08-27 01:49:32 +00005504 Overflow:
5505 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005506 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 Py_DECREF(item);
5508 /* fall through */
5509
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005511 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005512 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005513 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 return NULL;
5515}
5516
Tim Petersced69f82003-09-16 20:30:58 +00005517static
5518PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005519 Py_ssize_t left,
5520 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 Py_UNICODE fill)
5522{
5523 PyUnicodeObject *u;
5524
5525 if (left < 0)
5526 left = 0;
5527 if (right < 0)
5528 right = 0;
5529
Tim Peters7a29bd52001-09-12 03:03:31 +00005530 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 Py_INCREF(self);
5532 return self;
5533 }
5534
5535 u = _PyUnicode_New(left + self->length + right);
5536 if (u) {
5537 if (left)
5538 Py_UNICODE_FILL(u->str, fill, left);
5539 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5540 if (right)
5541 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5542 }
5543
5544 return u;
5545}
5546
/* Append data[left:right] as a new Unicode object to the list being
   built.  Relies on names from the expanding function: the locals
   `str` (scratch PyObject *) and `list`, and the label `onError`, which
   is jumped to when the object cannot be created or appended.  The
   temporary reference in `str` is always released.  Expands to several
   statements, so use it only inside a braced block. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
5557
5558static
5559PyObject *split_whitespace(PyUnicodeObject *self,
5560 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005563 register Py_ssize_t i;
5564 register Py_ssize_t j;
5565 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005567 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568
5569 for (i = j = 0; i < len; ) {
5570 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005571 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 i++;
5573 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005574 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 i++;
5576 if (j < i) {
5577 if (maxcount-- <= 0)
5578 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005579 SPLIT_APPEND(buf, j, i);
5580 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 i++;
5582 j = i;
5583 }
5584 }
5585 if (j < len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005586 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 }
5588 return list;
5589
5590 onError:
5591 Py_DECREF(list);
5592 return NULL;
5593}
5594
5595PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005596 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005598 register Py_ssize_t i;
5599 register Py_ssize_t j;
5600 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 PyObject *list;
5602 PyObject *str;
5603 Py_UNICODE *data;
5604
5605 string = PyUnicode_FromObject(string);
5606 if (string == NULL)
5607 return NULL;
5608 data = PyUnicode_AS_UNICODE(string);
5609 len = PyUnicode_GET_SIZE(string);
5610
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 list = PyList_New(0);
5612 if (!list)
5613 goto onError;
5614
5615 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005616 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005617
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005619 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621
5622 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005623 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 if (i < len) {
5625 if (data[i] == '\r' && i + 1 < len &&
5626 data[i+1] == '\n')
5627 i += 2;
5628 else
5629 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005630 if (keepends)
5631 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 }
Guido van Rossum86662912000-04-11 15:38:46 +00005633 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 j = i;
5635 }
5636 if (j < len) {
5637 SPLIT_APPEND(data, j, len);
5638 }
5639
5640 Py_DECREF(string);
5641 return list;
5642
5643 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005644 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 Py_DECREF(string);
5646 return NULL;
5647}
5648
Tim Petersced69f82003-09-16 20:30:58 +00005649static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650PyObject *split_char(PyUnicodeObject *self,
5651 PyObject *list,
5652 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005653 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005655 register Py_ssize_t i;
5656 register Py_ssize_t j;
5657 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005659 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660
5661 for (i = j = 0; i < len; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005662 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 if (maxcount-- <= 0)
5664 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005665 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 i = j = i + 1;
5667 } else
5668 i++;
5669 }
5670 if (j <= len) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005671 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 }
5673 return list;
5674
5675 onError:
5676 Py_DECREF(list);
5677 return NULL;
5678}
5679
Tim Petersced69f82003-09-16 20:30:58 +00005680static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681PyObject *split_substring(PyUnicodeObject *self,
5682 PyObject *list,
5683 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005684 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005686 register Py_ssize_t i;
5687 register Py_ssize_t j;
5688 Py_ssize_t len = self->length;
5689 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 PyObject *str;
5691
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005692 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 if (Py_UNICODE_MATCH(self, i, substring)) {
5694 if (maxcount-- <= 0)
5695 break;
5696 SPLIT_APPEND(self->str, j, i);
5697 i = j = i + sublen;
5698 } else
5699 i++;
5700 }
5701 if (j <= len) {
5702 SPLIT_APPEND(self->str, j, len);
5703 }
5704 return list;
5705
5706 onError:
5707 Py_DECREF(list);
5708 return NULL;
5709}
5710
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005711static
5712PyObject *rsplit_whitespace(PyUnicodeObject *self,
5713 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005714 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005715{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005716 register Py_ssize_t i;
5717 register Py_ssize_t j;
5718 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005719 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005720 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005721
5722 for (i = j = len - 1; i >= 0; ) {
5723 /* find a token */
Christian Heimes4d4f2702008-01-30 11:32:37 +00005724 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005725 i--;
5726 j = i;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005727 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005728 i--;
5729 if (j > i) {
5730 if (maxcount-- <= 0)
5731 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005732 SPLIT_APPEND(buf, i + 1, j + 1);
5733 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005734 i--;
5735 j = i;
5736 }
5737 }
5738 if (j >= 0) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005739 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005740 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005741 if (PyList_Reverse(list) < 0)
5742 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005743 return list;
5744
5745 onError:
5746 Py_DECREF(list);
5747 return NULL;
5748}
5749
5750static
5751PyObject *rsplit_char(PyUnicodeObject *self,
5752 PyObject *list,
5753 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005755{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005756 register Py_ssize_t i;
5757 register Py_ssize_t j;
5758 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005759 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005760 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005761
5762 for (i = j = len - 1; i >= 0; ) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005763 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005764 if (maxcount-- <= 0)
5765 break;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005766 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005767 j = i = i - 1;
5768 } else
5769 i--;
5770 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005771 if (j >= -1) {
Christian Heimes4d4f2702008-01-30 11:32:37 +00005772 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005773 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005774 if (PyList_Reverse(list) < 0)
5775 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005776 return list;
5777
5778 onError:
5779 Py_DECREF(list);
5780 return NULL;
5781}
5782
5783static
5784PyObject *rsplit_substring(PyUnicodeObject *self,
5785 PyObject *list,
5786 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005787 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005788{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005789 register Py_ssize_t i;
5790 register Py_ssize_t j;
5791 Py_ssize_t len = self->length;
5792 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005793 PyObject *str;
5794
5795 for (i = len - sublen, j = len; i >= 0; ) {
5796 if (Py_UNICODE_MATCH(self, i, substring)) {
5797 if (maxcount-- <= 0)
5798 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005799 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005800 j = i;
5801 i -= sublen;
5802 } else
5803 i--;
5804 }
5805 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005806 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005807 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005808 if (PyList_Reverse(list) < 0)
5809 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005810 return list;
5811
5812 onError:
5813 Py_DECREF(list);
5814 return NULL;
5815}
5816
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817#undef SPLIT_APPEND
5818
5819static
5820PyObject *split(PyUnicodeObject *self,
5821 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005822 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823{
5824 PyObject *list;
5825
5826 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005827 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
5829 list = PyList_New(0);
5830 if (!list)
5831 return NULL;
5832
5833 if (substring == NULL)
5834 return split_whitespace(self,list,maxcount);
5835
5836 else if (substring->length == 1)
5837 return split_char(self,list,substring->str[0],maxcount);
5838
5839 else if (substring->length == 0) {
5840 Py_DECREF(list);
5841 PyErr_SetString(PyExc_ValueError, "empty separator");
5842 return NULL;
5843 }
5844 else
5845 return split_substring(self,list,substring,maxcount);
5846}
5847
Tim Petersced69f82003-09-16 20:30:58 +00005848static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005849PyObject *rsplit(PyUnicodeObject *self,
5850 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852{
5853 PyObject *list;
5854
5855 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005856 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005857
5858 list = PyList_New(0);
5859 if (!list)
5860 return NULL;
5861
5862 if (substring == NULL)
5863 return rsplit_whitespace(self,list,maxcount);
5864
5865 else if (substring->length == 1)
5866 return rsplit_char(self,list,substring->str[0],maxcount);
5867
5868 else if (substring->length == 0) {
5869 Py_DECREF(list);
5870 PyErr_SetString(PyExc_ValueError, "empty separator");
5871 return NULL;
5872 }
5873 else
5874 return rsplit_substring(self,list,substring,maxcount);
5875}
5876
5877static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878PyObject *replace(PyUnicodeObject *self,
5879 PyUnicodeObject *str1,
5880 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005881 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882{
5883 PyUnicodeObject *u;
5884
5885 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005886 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
Fredrik Lundh347ee272006-05-24 16:35:18 +00005888 if (str1->length == str2->length) {
5889 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005890 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005891 if (str1->length == 1) {
5892 /* replace characters */
5893 Py_UNICODE u1, u2;
5894 if (!findchar(self->str, self->length, str1->str[0]))
5895 goto nothing;
5896 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5897 if (!u)
5898 return NULL;
5899 Py_UNICODE_COPY(u->str, self->str, self->length);
5900 u1 = str1->str[0];
5901 u2 = str2->str[0];
5902 for (i = 0; i < u->length; i++)
5903 if (u->str[i] == u1) {
5904 if (--maxcount < 0)
5905 break;
5906 u->str[i] = u2;
5907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005909 i = fastsearch(
5910 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005912 if (i < 0)
5913 goto nothing;
5914 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5915 if (!u)
5916 return NULL;
5917 Py_UNICODE_COPY(u->str, self->str, self->length);
5918 while (i <= self->length - str1->length)
5919 if (Py_UNICODE_MATCH(self, i, str1)) {
5920 if (--maxcount < 0)
5921 break;
5922 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5923 i += str1->length;
5924 } else
5925 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005928
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005929 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005930 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 Py_UNICODE *p;
5932
5933 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005934 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 if (n > maxcount)
5936 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005937 if (n == 0)
5938 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005939 /* new_size = self->length + n * (str2->length - str1->length)); */
5940 delta = (str2->length - str1->length);
5941 if (delta == 0) {
5942 new_size = self->length;
5943 } else {
5944 product = n * (str2->length - str1->length);
5945 if ((product / (str2->length - str1->length)) != n) {
5946 PyErr_SetString(PyExc_OverflowError,
5947 "replace string is too long");
5948 return NULL;
5949 }
5950 new_size = self->length + product;
5951 if (new_size < 0) {
5952 PyErr_SetString(PyExc_OverflowError,
5953 "replace string is too long");
5954 return NULL;
5955 }
5956 }
5957 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005958 if (!u)
5959 return NULL;
5960 i = 0;
5961 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005962 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005963 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005964 while (n-- > 0) {
5965 /* look for next match */
5966 j = i;
5967 while (j <= e) {
5968 if (Py_UNICODE_MATCH(self, j, str1))
5969 break;
5970 j++;
5971 }
5972 if (j > i) {
5973 if (j > e)
5974 break;
5975 /* copy unchanged part [i:j] */
5976 Py_UNICODE_COPY(p, self->str+i, j-i);
5977 p += j - i;
5978 }
5979 /* copy substitution string */
5980 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005981 Py_UNICODE_COPY(p, str2->str, str2->length);
5982 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005983 }
5984 i = j + str1->length;
5985 }
5986 if (i < self->length)
5987 /* copy tail [i:] */
5988 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005989 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005990 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991 while (n > 0) {
5992 Py_UNICODE_COPY(p, str2->str, str2->length);
5993 p += str2->length;
5994 if (--n <= 0)
5995 break;
5996 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005998 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 }
6000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006002
6003nothing:
6004 /* nothing to replace; return original string (when possible) */
6005 if (PyUnicode_CheckExact(self)) {
6006 Py_INCREF(self);
6007 return (PyObject *) self;
6008 }
6009 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010}
6011
6012/* --- Unicode Object Methods --------------------------------------------- */
6013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006014PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015"S.title() -> unicode\n\
6016\n\
6017Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
6020static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006021unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 return fixup(self, fixtitle);
6024}
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027"S.capitalize() -> unicode\n\
6028\n\
6029Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006030have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
6032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006033unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 return fixup(self, fixcapitalize);
6036}
6037
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the words.
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;              /* result is still NULL */
        /* release the old word before overwriting its slot */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6075
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006076/* Argument converter. Coerces to a single unicode character */
6077
6078static int
6079convert_uc(PyObject *obj, void *addr)
6080{
6081 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6082 PyObject *uniobj;
6083 Py_UNICODE *unistr;
6084
6085 uniobj = PyUnicode_FromObject(obj);
6086 if (uniobj == NULL) {
6087 PyErr_SetString(PyExc_TypeError,
6088 "The fill character cannot be converted to Unicode");
6089 return 0;
6090 }
6091 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6092 PyErr_SetString(PyExc_TypeError,
6093 "The fill character must be exactly one character long");
6094 Py_DECREF(uniobj);
6095 return 0;
6096 }
6097 unistr = PyUnicode_AS_UNICODE(uniobj);
6098 *fillcharloc = unistr[0];
6099 Py_DECREF(uniobj);
6100 return 1;
6101}
6102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006103PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006104"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006106Return S centered in a Unicode string of length width. Padding is\n\
6107done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108
6109static PyObject *
6110unicode_center(PyUnicodeObject *self, PyObject *args)
6111{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006112 Py_ssize_t marg, left;
6113 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006114 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Thomas Woutersde017742006-02-16 19:34:37 +00006116 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 return NULL;
6118
Tim Peters7a29bd52001-09-12 03:03:31 +00006119 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 Py_INCREF(self);
6121 return (PyObject*) self;
6122 }
6123
6124 marg = width - self->length;
6125 left = marg / 2 + (marg & width & 1);
6126
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006127 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128}
6129
Marc-André Lemburge5034372000-08-08 08:04:29 +00006130#if 0
6131
6132/* This code should go into some future Unicode collation support
6133 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006134 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006135
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006136/* speedy UTF-16 code point order comparison */
6137/* gleaned from: */
6138/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6139
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006140static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006141{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006142 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006143 0, 0, 0, 0, 0, 0, 0, 0,
6144 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006145 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006146};
6147
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148static int
6149unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6150{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006151 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006152
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 Py_UNICODE *s1 = str1->str;
6154 Py_UNICODE *s2 = str2->str;
6155
6156 len1 = str1->length;
6157 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006158
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006160 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006161
6162 c1 = *s1++;
6163 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006164
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006165 if (c1 > (1<<11) * 26)
6166 c1 += utf16Fixup[c1>>11];
6167 if (c2 > (1<<11) * 26)
6168 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006169 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006170
6171 if (c1 != c2)
6172 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006173
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006174 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 }
6176
6177 return (len1 < len2) ? -1 : (len1 != len2);
6178}
6179
Marc-André Lemburge5034372000-08-08 08:04:29 +00006180#else
6181
6182static int
6183unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006185 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006186
6187 Py_UNICODE *s1 = str1->str;
6188 Py_UNICODE *s2 = str2->str;
6189
6190 len1 = str1->length;
6191 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006192
Marc-André Lemburge5034372000-08-08 08:04:29 +00006193 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006194 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006195
Fredrik Lundh45714e92001-06-26 16:39:36 +00006196 c1 = *s1++;
6197 c2 = *s2++;
6198
6199 if (c1 != c2)
6200 return (c1 < c2) ? -1 : 1;
6201
Marc-André Lemburge5034372000-08-08 08:04:29 +00006202 len1--; len2--;
6203 }
6204
6205 return (len1 < len2) ? -1 : (len1 != len2);
6206}
6207
6208#endif
6209
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210int PyUnicode_Compare(PyObject *left,
6211 PyObject *right)
6212{
6213 PyUnicodeObject *u = NULL, *v = NULL;
6214 int result;
6215
6216 /* Coerce the two arguments */
6217 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6218 if (u == NULL)
6219 goto onError;
6220 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6221 if (v == NULL)
6222 goto onError;
6223
Thomas Wouters7e474022000-07-16 12:04:32 +00006224 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 if (v == u) {
6226 Py_DECREF(u);
6227 Py_DECREF(v);
6228 return 0;
6229 }
6230
6231 result = unicode_compare(u, v);
6232
6233 Py_DECREF(u);
6234 Py_DECREF(v);
6235 return result;
6236
6237onError:
6238 Py_XDECREF(u);
6239 Py_XDECREF(v);
6240 return -1;
6241}
6242
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006243PyObject *PyUnicode_RichCompare(PyObject *left,
6244 PyObject *right,
6245 int op)
6246{
6247 int result;
6248
6249 result = PyUnicode_Compare(left, right);
6250 if (result == -1 && PyErr_Occurred())
6251 goto onError;
6252
6253 /* Convert the return value to a Boolean */
6254 switch (op) {
6255 case Py_EQ:
6256 result = (result == 0);
6257 break;
6258 case Py_NE:
6259 result = (result != 0);
6260 break;
6261 case Py_LE:
6262 result = (result <= 0);
6263 break;
6264 case Py_GE:
6265 result = (result >= 0);
6266 break;
6267 case Py_LT:
6268 result = (result == -1);
6269 break;
6270 case Py_GT:
6271 result = (result == 1);
6272 break;
6273 }
6274 return PyBool_FromLong(result);
6275
6276 onError:
6277
6278 /* Standard case
6279
6280 Type errors mean that PyUnicode_FromObject() could not convert
6281 one of the arguments (usually the right hand side) to Unicode,
6282 ie. we can't handle the comparison request. However, it is
6283 possible that the other object knows a comparison method, which
6284 is why we return Py_NotImplemented to give the other object a
6285 chance.
6286
6287 */
6288 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6289 PyErr_Clear();
6290 Py_INCREF(Py_NotImplemented);
6291 return Py_NotImplemented;
6292 }
6293 if (op != Py_EQ && op != Py_NE)
6294 return NULL;
6295
6296 /* Equality comparison.
6297
6298 This is a special case: we silence any PyExc_UnicodeDecodeError
6299 and instead turn it into a PyErr_UnicodeWarning.
6300
6301 */
6302 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6303 return NULL;
6304 PyErr_Clear();
6305 if (PyErr_Warn(PyExc_UnicodeWarning,
6306 (op == Py_EQ) ?
6307 "Unicode equal comparison "
6308 "failed to convert both arguments to Unicode - "
6309 "interpreting them as being unequal" :
6310 "Unicode unequal comparison "
6311 "failed to convert both arguments to Unicode - "
6312 "interpreting them as being unequal"
6313 ) < 0)
6314 return NULL;
6315 result = (op == Py_NE);
6316 return PyBool_FromLong(result);
6317}
6318
Guido van Rossum403d68b2000-03-13 15:55:09 +00006319int PyUnicode_Contains(PyObject *container,
6320 PyObject *element)
6321{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006322 PyObject *str, *sub;
6323 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006324
6325 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006326 sub = PyUnicode_FromObject(element);
6327 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006328 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00006329 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006330 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006331 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006332
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006333 str = PyUnicode_FromObject(container);
6334 if (!str) {
6335 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006336 return -1;
6337 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006338
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006339 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006340
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006341 Py_DECREF(str);
6342 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006343
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006344 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006345}
6346
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347/* Concat to string or Unicode object giving a new Unicode object. */
6348
6349PyObject *PyUnicode_Concat(PyObject *left,
6350 PyObject *right)
6351{
6352 PyUnicodeObject *u = NULL, *v = NULL, *w;
6353
6354 /* Coerce the two arguments */
6355 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6356 if (u == NULL)
6357 goto onError;
6358 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6359 if (v == NULL)
6360 goto onError;
6361
6362 /* Shortcuts */
6363 if (v == unicode_empty) {
6364 Py_DECREF(v);
6365 return (PyObject *)u;
6366 }
6367 if (u == unicode_empty) {
6368 Py_DECREF(u);
6369 return (PyObject *)v;
6370 }
6371
6372 /* Concat the two Unicode strings */
6373 w = _PyUnicode_New(u->length + v->length);
6374 if (w == NULL)
6375 goto onError;
6376 Py_UNICODE_COPY(w->str, u->str, u->length);
6377 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6378
6379 Py_DECREF(u);
6380 Py_DECREF(v);
6381 return (PyObject *)w;
6382
6383onError:
6384 Py_XDECREF(u);
6385 Py_XDECREF(v);
6386 return NULL;
6387}
6388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006389PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390"S.count(sub[, start[, end]]) -> int\n\
6391\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006392Return the number of non-overlapping occurrences of substring sub in\n\
6393Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006394interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395
6396static PyObject *
6397unicode_count(PyUnicodeObject *self, PyObject *args)
6398{
6399 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006400 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006401 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 PyObject *result;
6403
Guido van Rossumb8872e62000-05-09 14:14:27 +00006404 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6405 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 return NULL;
6407
6408 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006409 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 if (substring == NULL)
6411 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006412
Fredrik Lundhc8162812006-05-26 19:33:03 +00006413 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006415 result = PyInt_FromSsize_t(
6416 stringlib_count(self->str + start, end - start,
6417 substring->str, substring->length)
6418 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
6420 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006421
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 return result;
6423}
6424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006425PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006426"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006428Encodes S using the codec registered for encoding. encoding defaults\n\
6429to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006430handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6432'xmlcharrefreplace' as well as any other name registered with\n\
6433codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434
6435static PyObject *
6436unicode_encode(PyUnicodeObject *self, PyObject *args)
6437{
6438 char *encoding = NULL;
6439 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006440 PyObject *v;
6441
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6443 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006444 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006445 if (v == NULL)
6446 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006447 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6448 PyErr_Format(PyExc_TypeError,
6449 "encoder did not return a string/unicode object "
6450 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006451 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006452 Py_DECREF(v);
6453 return NULL;
6454 }
6455 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006456
6457 onError:
6458 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006459}
6460
6461PyDoc_STRVAR(decode__doc__,
6462"S.decode([encoding[,errors]]) -> string or unicode\n\
6463\n\
6464Decodes S using the codec registered for encoding. encoding defaults\n\
6465to the default encoding. errors may be given to set a different error\n\
6466handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6467a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6468as well as any other name registerd with codecs.register_error that is\n\
6469able to handle UnicodeDecodeErrors.");
6470
6471static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006472unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006473{
6474 char *encoding = NULL;
6475 char *errors = NULL;
6476 PyObject *v;
6477
6478 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6479 return NULL;
6480 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006481 if (v == NULL)
6482 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006483 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6484 PyErr_Format(PyExc_TypeError,
6485 "decoder did not return a string/unicode object "
6486 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006487 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006488 Py_DECREF(v);
6489 return NULL;
6490 }
6491 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006492
6493 onError:
6494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006497PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498"S.expandtabs([tabsize]) -> unicode\n\
6499\n\
6500Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006501If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502
6503static PyObject*
6504unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6505{
6506 Py_UNICODE *e;
6507 Py_UNICODE *p;
6508 Py_UNICODE *q;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006509 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 PyUnicodeObject *u;
6511 int tabsize = 8;
6512
6513 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6514 return NULL;
6515
Thomas Wouters7e474022000-07-16 12:04:32 +00006516 /* First pass: determine size of output string */
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006517 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 e = self->str + self->length;
6519 for (p = self->str; p < e; p++)
6520 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006521 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 j += tabsize - (j % tabsize);
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006523 if (old_j > j) {
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00006524 PyErr_SetString(PyExc_OverflowError,
6525 "new string is too long");
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006526 return NULL;
6527 }
6528 old_j = j;
6529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
6531 else {
6532 j++;
6533 if (*p == '\n' || *p == '\r') {
6534 i += j;
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00006535 old_j = j = 0;
6536 if (i < 0) {
6537 PyErr_SetString(PyExc_OverflowError,
6538 "new string is too long");
6539 return NULL;
6540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 }
6542 }
6543
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006544 if ((i + j) < 0) {
6545 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6546 return NULL;
6547 }
6548
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 /* Second pass: create output string and fill it */
6550 u = _PyUnicode_New(i + j);
6551 if (!u)
6552 return NULL;
6553
6554 j = 0;
6555 q = u->str;
6556
6557 for (p = self->str; p < e; p++)
6558 if (*p == '\t') {
6559 if (tabsize > 0) {
6560 i = tabsize - (j % tabsize);
6561 j += i;
6562 while (i--)
6563 *q++ = ' ';
6564 }
6565 }
6566 else {
6567 j++;
6568 *q++ = *p;
6569 if (*p == '\n' || *p == '\r')
6570 j = 0;
6571 }
6572
6573 return (PyObject*) u;
6574}
6575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006576PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577"S.find(sub [,start [,end]]) -> int\n\
6578\n\
6579Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006580such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581arguments start and end are interpreted as in slice notation.\n\
6582\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006583Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585static PyObject *
6586unicode_find(PyUnicodeObject *self, PyObject *args)
6587{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006588 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006589 Py_ssize_t start;
6590 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006591 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
Facundo Batista57d56692007-11-16 18:04:14 +00006593 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006596 result = stringlib_find_slice(
6597 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6598 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6599 start, end
6600 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
6602 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006603
6604 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605}
6606
6607static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006608unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
6610 if (index < 0 || index >= self->length) {
6611 PyErr_SetString(PyExc_IndexError, "string index out of range");
6612 return NULL;
6613 }
6614
6615 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6616}
6617
6618static long
6619unicode_hash(PyUnicodeObject *self)
6620{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006621 /* Since Unicode objects compare equal to their ASCII string
6622 counterparts, they should use the individual character values
6623 as basis for their hash value. This is needed to assure that
6624 strings and Unicode objects behave in the same way as
6625 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626
Martin v. Löwis18e16552006-02-15 17:27:45 +00006627 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006628 register Py_UNICODE *p;
6629 register long x;
6630
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 if (self->hash != -1)
6632 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006633 len = PyUnicode_GET_SIZE(self);
6634 p = PyUnicode_AS_UNICODE(self);
6635 x = *p << 7;
6636 while (--len >= 0)
6637 x = (1000003*x) ^ *p++;
6638 x ^= PyUnicode_GET_SIZE(self);
6639 if (x == -1)
6640 x = -2;
6641 self->hash = x;
6642 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643}
6644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006645PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646"S.index(sub [,start [,end]]) -> int\n\
6647\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006648Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649
6650static PyObject *
6651unicode_index(PyUnicodeObject *self, PyObject *args)
6652{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006653 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006654 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006655 Py_ssize_t start;
6656 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
Facundo Batista57d56692007-11-16 18:04:14 +00006658 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006661 result = stringlib_find_slice(
6662 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6663 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6664 start, end
6665 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
6667 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006668
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 if (result < 0) {
6670 PyErr_SetString(PyExc_ValueError, "substring not found");
6671 return NULL;
6672 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006673
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675}
6676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006677PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006678"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006680Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006681at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
6683static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006684unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685{
6686 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6687 register const Py_UNICODE *e;
6688 int cased;
6689
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 /* Shortcut for single character strings */
6691 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006692 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006694 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006695 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006696 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 e = p + PyUnicode_GET_SIZE(self);
6699 cased = 0;
6700 for (; p < e; p++) {
6701 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006702
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006704 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 else if (!cased && Py_UNICODE_ISLOWER(ch))
6706 cased = 1;
6707 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006708 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709}
6710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006711PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006712"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006714Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006715at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716
6717static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006718unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
6720 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6721 register const Py_UNICODE *e;
6722 int cased;
6723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 /* Shortcut for single character strings */
6725 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006726 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006728 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006729 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006730 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006731
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 e = p + PyUnicode_GET_SIZE(self);
6733 cased = 0;
6734 for (; p < e; p++) {
6735 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006738 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 else if (!cased && Py_UNICODE_ISUPPER(ch))
6740 cased = 1;
6741 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006742 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743}
6744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006745PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006746"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006748Return True if S is a titlecased string and there is at least one\n\
6749character in S, i.e. upper- and titlecase characters may only\n\
6750follow uncased characters and lowercase characters only cased ones.\n\
6751Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752
6753static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006754unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755{
6756 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6757 register const Py_UNICODE *e;
6758 int cased, previous_is_cased;
6759
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 /* Shortcut for single character strings */
6761 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006762 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6763 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006765 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006766 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 e = p + PyUnicode_GET_SIZE(self);
6770 cased = 0;
6771 previous_is_cased = 0;
6772 for (; p < e; p++) {
6773 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006774
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6776 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 previous_is_cased = 1;
6779 cased = 1;
6780 }
6781 else if (Py_UNICODE_ISLOWER(ch)) {
6782 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006783 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 previous_is_cased = 1;
6785 cased = 1;
6786 }
6787 else
6788 previous_is_cased = 0;
6789 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006790 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791}
6792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006793PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006794"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006796Return True if all characters in S are whitespace\n\
6797and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798
6799static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006800unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801{
6802 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6803 register const Py_UNICODE *e;
6804
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 /* Shortcut for single character strings */
6806 if (PyUnicode_GET_SIZE(self) == 1 &&
6807 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006808 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006810 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006811 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 e = p + PyUnicode_GET_SIZE(self);
6815 for (; p < e; p++) {
6816 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006817 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006819 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820}
6821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006822PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006823"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006824\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006825Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006826and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006827
6828static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006829unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006830{
6831 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6832 register const Py_UNICODE *e;
6833
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006834 /* Shortcut for single character strings */
6835 if (PyUnicode_GET_SIZE(self) == 1 &&
6836 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006837 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006838
6839 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006840 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006841 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006842
6843 e = p + PyUnicode_GET_SIZE(self);
6844 for (; p < e; p++) {
6845 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006846 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006847 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006848 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006849}
6850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006851PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006852"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006853\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006854Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006855and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006856
6857static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006858unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006859{
6860 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6861 register const Py_UNICODE *e;
6862
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006863 /* Shortcut for single character strings */
6864 if (PyUnicode_GET_SIZE(self) == 1 &&
6865 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006866 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006867
6868 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006869 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006871
6872 e = p + PyUnicode_GET_SIZE(self);
6873 for (; p < e; p++) {
6874 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006875 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006876 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006878}
6879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006880PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006881"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006884False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
6886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006887unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
6889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6890 register const Py_UNICODE *e;
6891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 /* Shortcut for single character strings */
6893 if (PyUnicode_GET_SIZE(self) == 1 &&
6894 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006895 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006897 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006898 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006899 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 e = p + PyUnicode_GET_SIZE(self);
6902 for (; p < e; p++) {
6903 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006904 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006909PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006910"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006912Return True if all characters in S are digits\n\
6913and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914
6915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006916unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917{
6918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6919 register const Py_UNICODE *e;
6920
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 /* Shortcut for single character strings */
6922 if (PyUnicode_GET_SIZE(self) == 1 &&
6923 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006924 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006926 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006927 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006928 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006929
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 e = p + PyUnicode_GET_SIZE(self);
6931 for (; p < e; p++) {
6932 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006933 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006939"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006941Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006942False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943
6944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6949
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1 &&
6952 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006953 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006956 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006958
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 e = p + PyUnicode_GET_SIZE(self);
6960 for (; p < e; p++) {
6961 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006964 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965}
6966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006967PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968"S.join(sequence) -> unicode\n\
6969\n\
6970Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
6973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006974unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006976 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977}
6978
Martin v. Löwis18e16552006-02-15 17:27:45 +00006979static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980unicode_length(PyUnicodeObject *self)
6981{
6982 return self->length;
6983}
6984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006986"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987\n\
6988Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006989done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990
6991static PyObject *
6992unicode_ljust(PyUnicodeObject *self, PyObject *args)
6993{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006994 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006995 Py_UNICODE fillchar = ' ';
6996
Martin v. Löwis412fb672006-04-13 06:34:32 +00006997 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 return NULL;
6999
Tim Peters7a29bd52001-09-12 03:03:31 +00007000 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 Py_INCREF(self);
7002 return (PyObject*) self;
7003 }
7004
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007005 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006}
7007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007008PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009"S.lower() -> unicode\n\
7010\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007011Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
7013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007014unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 return fixup(self, fixlower);
7017}
7018
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
7027
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007028/* externally visible for str.strip(unicode) */
7029PyObject *
7030_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7031{
7032 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007033 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007034 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007035 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7036 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007037
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007038 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7039
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040 i = 0;
7041 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007042 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7043 i++;
7044 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007045 }
7046
7047 j = len;
7048 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007049 do {
7050 j--;
7051 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7052 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007053 }
7054
7055 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007056 Py_INCREF(self);
7057 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007058 }
7059 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007060 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007061}
7062
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
7064static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007065do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007067 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007068 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
7070 i = 0;
7071 if (striptype != RIGHTSTRIP) {
7072 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7073 i++;
7074 }
7075 }
7076
7077 j = len;
7078 if (striptype != LEFTSTRIP) {
7079 do {
7080 j--;
7081 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7082 j++;
7083 }
7084
7085 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7086 Py_INCREF(self);
7087 return (PyObject*)self;
7088 }
7089 else
7090 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091}
7092
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007093
7094static PyObject *
7095do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7096{
7097 PyObject *sep = NULL;
7098
7099 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7100 return NULL;
7101
7102 if (sep != NULL && sep != Py_None) {
7103 if (PyUnicode_Check(sep))
7104 return _PyUnicode_XStrip(self, striptype, sep);
7105 else if (PyString_Check(sep)) {
7106 PyObject *res;
7107 sep = PyUnicode_FromObject(sep);
7108 if (sep==NULL)
7109 return NULL;
7110 res = _PyUnicode_XStrip(self, striptype, sep);
7111 Py_DECREF(sep);
7112 return res;
7113 }
7114 else {
7115 PyErr_Format(PyExc_TypeError,
7116 "%s arg must be None, unicode or str",
7117 STRIPNAME(striptype));
7118 return NULL;
7119 }
7120 }
7121
7122 return do_strip(self, striptype);
7123}
7124
7125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007126PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007127"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007128\n\
7129Return a copy of the string S with leading and trailing\n\
7130whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007131If chars is given and not None, remove characters in chars instead.\n\
7132If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007133
7134static PyObject *
7135unicode_strip(PyUnicodeObject *self, PyObject *args)
7136{
7137 if (PyTuple_GET_SIZE(args) == 0)
7138 return do_strip(self, BOTHSTRIP); /* Common case */
7139 else
7140 return do_argstrip(self, BOTHSTRIP, args);
7141}
7142
7143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007145"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007146\n\
7147Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007148If chars is given and not None, remove characters in chars instead.\n\
7149If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007150
7151static PyObject *
7152unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7153{
7154 if (PyTuple_GET_SIZE(args) == 0)
7155 return do_strip(self, LEFTSTRIP); /* Common case */
7156 else
7157 return do_argstrip(self, LEFTSTRIP, args);
7158}
7159
7160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007161PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00007162"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007163\n\
7164Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007165If chars is given and not None, remove characters in chars instead.\n\
7166If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007167
7168static PyObject *
7169unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7170{
7171 if (PyTuple_GET_SIZE(args) == 0)
7172 return do_strip(self, RIGHTSTRIP); /* Common case */
7173 else
7174 return do_argstrip(self, RIGHTSTRIP, args);
7175}
7176
7177
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007179unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180{
7181 PyUnicodeObject *u;
7182 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007183 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007184 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
7186 if (len < 0)
7187 len = 0;
7188
Tim Peters7a29bd52001-09-12 03:03:31 +00007189 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 /* no repeat, return original string */
7191 Py_INCREF(str);
7192 return (PyObject*) str;
7193 }
Tim Peters8f422462000-09-09 06:13:41 +00007194
7195 /* ensure # of chars needed doesn't overflow int and # of bytes
7196 * needed doesn't overflow size_t
7197 */
7198 nchars = len * str->length;
7199 if (len && nchars / len != str->length) {
7200 PyErr_SetString(PyExc_OverflowError,
7201 "repeated string is too long");
7202 return NULL;
7203 }
7204 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7205 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7206 PyErr_SetString(PyExc_OverflowError,
7207 "repeated string is too long");
7208 return NULL;
7209 }
7210 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 if (!u)
7212 return NULL;
7213
7214 p = u->str;
7215
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007216 if (str->length == 1 && len > 0) {
7217 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007218 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00007219 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007220 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007221 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007222 done = str->length;
7223 }
7224 while (done < nchars) {
7225 int n = (done <= nchars-done) ? done : nchars-done;
7226 Py_UNICODE_COPY(p+done, p, n);
7227 done += n;
7228 }
7229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
7231 return (PyObject*) u;
7232}
7233
7234PyObject *PyUnicode_Replace(PyObject *obj,
7235 PyObject *subobj,
7236 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007237 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238{
7239 PyObject *self;
7240 PyObject *str1;
7241 PyObject *str2;
7242 PyObject *result;
7243
7244 self = PyUnicode_FromObject(obj);
7245 if (self == NULL)
7246 return NULL;
7247 str1 = PyUnicode_FromObject(subobj);
7248 if (str1 == NULL) {
7249 Py_DECREF(self);
7250 return NULL;
7251 }
7252 str2 = PyUnicode_FromObject(replobj);
7253 if (str2 == NULL) {
7254 Py_DECREF(self);
7255 Py_DECREF(str1);
7256 return NULL;
7257 }
Tim Petersced69f82003-09-16 20:30:58 +00007258 result = replace((PyUnicodeObject *)self,
7259 (PyUnicodeObject *)str1,
7260 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 maxcount);
7262 Py_DECREF(self);
7263 Py_DECREF(str1);
7264 Py_DECREF(str2);
7265 return result;
7266}
7267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007268PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269"S.replace (old, new[, maxsplit]) -> unicode\n\
7270\n\
7271Return a copy of S with all occurrences of substring\n\
7272old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007273given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274
7275static PyObject*
7276unicode_replace(PyUnicodeObject *self, PyObject *args)
7277{
7278 PyUnicodeObject *str1;
7279 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007280 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 PyObject *result;
7282
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 return NULL;
7285 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7286 if (str1 == NULL)
7287 return NULL;
7288 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007289 if (str2 == NULL) {
7290 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293
7294 result = replace(self, str1, str2, maxcount);
7295
7296 Py_DECREF(str1);
7297 Py_DECREF(str2);
7298 return result;
7299}
7300
7301static
7302PyObject *unicode_repr(PyObject *unicode)
7303{
7304 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7305 PyUnicode_GET_SIZE(unicode),
7306 1);
7307}
7308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007309PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310"S.rfind(sub [,start [,end]]) -> int\n\
7311\n\
7312Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007313such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314arguments start and end are interpreted as in slice notation.\n\
7315\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007316Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317
7318static PyObject *
7319unicode_rfind(PyUnicodeObject *self, PyObject *args)
7320{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007321 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007322 Py_ssize_t start;
7323 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007324 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
Facundo Batista57d56692007-11-16 18:04:14 +00007326 if (!_ParseTupleFinds(args, &substring, &start, &end))
7327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007329 result = stringlib_rfind_slice(
7330 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7331 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7332 start, end
7333 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334
7335 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007336
7337 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338}
7339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007340PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341"S.rindex(sub [,start [,end]]) -> int\n\
7342\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007343Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344
7345static PyObject *
7346unicode_rindex(PyUnicodeObject *self, PyObject *args)
7347{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007348 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007349 Py_ssize_t start;
7350 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007351 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352
Facundo Batista57d56692007-11-16 18:04:14 +00007353 if (!_ParseTupleFinds(args, &substring, &start, &end))
7354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007356 result = stringlib_rfind_slice(
7357 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7358 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7359 start, end
7360 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
7362 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007363
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 if (result < 0) {
7365 PyErr_SetString(PyExc_ValueError, "substring not found");
7366 return NULL;
7367 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007368 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369}
7370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007371PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007372"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373\n\
7374Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007375done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376
7377static PyObject *
7378unicode_rjust(PyUnicodeObject *self, PyObject *args)
7379{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007380 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007381 Py_UNICODE fillchar = ' ';
7382
Martin v. Löwis412fb672006-04-13 06:34:32 +00007383 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 return NULL;
7385
Tim Peters7a29bd52001-09-12 03:03:31 +00007386 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 Py_INCREF(self);
7388 return (PyObject*) self;
7389 }
7390
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007391 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392}
7393
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396{
7397 /* standard clamping */
7398 if (start < 0)
7399 start = 0;
7400 if (end < 0)
7401 end = 0;
7402 if (end > self->length)
7403 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007404 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 /* full slice, return original string */
7406 Py_INCREF(self);
7407 return (PyObject*) self;
7408 }
7409 if (start > end)
7410 start = end;
7411 /* copy slice */
7412 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7413 end - start);
7414}
7415
7416PyObject *PyUnicode_Split(PyObject *s,
7417 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007418 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419{
7420 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007421
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422 s = PyUnicode_FromObject(s);
7423 if (s == NULL)
7424 return NULL;
7425 if (sep != NULL) {
7426 sep = PyUnicode_FromObject(sep);
7427 if (sep == NULL) {
7428 Py_DECREF(s);
7429 return NULL;
7430 }
7431 }
7432
7433 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7434
7435 Py_DECREF(s);
7436 Py_XDECREF(sep);
7437 return result;
7438}
7439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007440PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441"S.split([sep [,maxsplit]]) -> list of strings\n\
7442\n\
7443Return a list of the words in S, using sep as the\n\
7444delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007445splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007446any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447
7448static PyObject*
7449unicode_split(PyUnicodeObject *self, PyObject *args)
7450{
7451 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007452 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
Martin v. Löwis18e16552006-02-15 17:27:45 +00007454 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 return NULL;
7456
7457 if (substring == Py_None)
7458 return split(self, NULL, maxcount);
7459 else if (PyUnicode_Check(substring))
7460 return split(self, (PyUnicodeObject *)substring, maxcount);
7461 else
7462 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7463}
7464
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007465PyObject *
7466PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7467{
7468 PyObject* str_obj;
7469 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007470 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007471
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007472 str_obj = PyUnicode_FromObject(str_in);
7473 if (!str_obj)
7474 return NULL;
7475 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007476 if (!sep_obj) {
7477 Py_DECREF(str_obj);
7478 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007479 }
7480
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007481 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007482 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7483 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7484 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007485
Fredrik Lundhb9479482006-05-26 17:22:38 +00007486 Py_DECREF(sep_obj);
7487 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007488
7489 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007490}
7491
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007492
7493PyObject *
7494PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7495{
7496 PyObject* str_obj;
7497 PyObject* sep_obj;
7498 PyObject* out;
7499
7500 str_obj = PyUnicode_FromObject(str_in);
7501 if (!str_obj)
7502 return NULL;
7503 sep_obj = PyUnicode_FromObject(sep_in);
7504 if (!sep_obj) {
7505 Py_DECREF(str_obj);
7506 return NULL;
7507 }
7508
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007509 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007510 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7511 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7512 );
7513
7514 Py_DECREF(sep_obj);
7515 Py_DECREF(str_obj);
7516
7517 return out;
7518}
7519
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007520PyDoc_STRVAR(partition__doc__,
7521"S.partition(sep) -> (head, sep, tail)\n\
7522\n\
7523Searches for the separator sep in S, and returns the part before it,\n\
7524the separator itself, and the part after it. If the separator is not\n\
7525found, returns S and two empty strings.");
7526
7527static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007528unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007529{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007530 return PyUnicode_Partition((PyObject *)self, separator);
7531}
7532
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007533PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007534"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007535\n\
7536Searches for the separator sep in S, starting at the end of S, and returns\n\
7537the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007538separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007539
7540static PyObject*
7541unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7542{
7543 return PyUnicode_RPartition((PyObject *)self, separator);
7544}
7545
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007546PyObject *PyUnicode_RSplit(PyObject *s,
7547 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007548 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007549{
7550 PyObject *result;
7551
7552 s = PyUnicode_FromObject(s);
7553 if (s == NULL)
7554 return NULL;
7555 if (sep != NULL) {
7556 sep = PyUnicode_FromObject(sep);
7557 if (sep == NULL) {
7558 Py_DECREF(s);
7559 return NULL;
7560 }
7561 }
7562
7563 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7564
7565 Py_DECREF(s);
7566 Py_XDECREF(sep);
7567 return result;
7568}
7569
7570PyDoc_STRVAR(rsplit__doc__,
7571"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7572\n\
7573Return a list of the words in S, using sep as the\n\
7574delimiter string, starting at the end of the string and\n\
7575working to the front. If maxsplit is given, at most maxsplit\n\
7576splits are done. If sep is not specified, any whitespace string\n\
7577is a separator.");
7578
7579static PyObject*
7580unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7581{
7582 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007583 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007584
Martin v. Löwis18e16552006-02-15 17:27:45 +00007585 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007586 return NULL;
7587
7588 if (substring == Py_None)
7589 return rsplit(self, NULL, maxcount);
7590 else if (PyUnicode_Check(substring))
7591 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7592 else
7593 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7594}
7595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007596PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007597"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598\n\
7599Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007600Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007601is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
7603static PyObject*
7604unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7605{
Guido van Rossum86662912000-04-11 15:38:46 +00007606 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607
Guido van Rossum86662912000-04-11 15:38:46 +00007608 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 return NULL;
7610
Guido van Rossum86662912000-04-11 15:38:46 +00007611 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612}
7613
7614static
7615PyObject *unicode_str(PyUnicodeObject *self)
7616{
Fred Drakee4315f52000-05-09 19:53:39 +00007617 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618}
7619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007620PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621"S.swapcase() -> unicode\n\
7622\n\
7623Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007624and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625
7626static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007627unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 return fixup(self, fixswapcase);
7630}
7631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007632PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633"S.translate(table) -> unicode\n\
7634\n\
7635Return a copy of the string S, where all characters have been mapped\n\
7636through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007637Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7638Unmapped characters are left untouched. Characters mapped to None\n\
7639are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
7641static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007642unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643{
Tim Petersced69f82003-09-16 20:30:58 +00007644 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007646 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 "ignore");
7648}
7649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651"S.upper() -> unicode\n\
7652\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007653Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
7655static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007656unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 return fixup(self, fixupper);
7659}
7660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007661PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662"S.zfill(width) -> unicode\n\
7663\n\
7664Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007665of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666
7667static PyObject *
7668unicode_zfill(PyUnicodeObject *self, PyObject *args)
7669{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007670 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671 PyUnicodeObject *u;
7672
Martin v. Löwis18e16552006-02-15 17:27:45 +00007673 Py_ssize_t width;
7674 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 return NULL;
7676
7677 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007678 if (PyUnicode_CheckExact(self)) {
7679 Py_INCREF(self);
7680 return (PyObject*) self;
7681 }
7682 else
7683 return PyUnicode_FromUnicode(
7684 PyUnicode_AS_UNICODE(self),
7685 PyUnicode_GET_SIZE(self)
7686 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687 }
7688
7689 fill = width - self->length;
7690
7691 u = pad(self, fill, 0, '0');
7692
Walter Dörwald068325e2002-04-15 13:36:47 +00007693 if (u == NULL)
7694 return NULL;
7695
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 if (u->str[fill] == '+' || u->str[fill] == '-') {
7697 /* move sign to beginning of string */
7698 u->str[0] = u->str[fill];
7699 u->str[fill] = '0';
7700 }
7701
7702 return (PyObject*) u;
7703}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704
#if 0
/* Debugging aid (compiled out): report the file-level `numfree` counter
   as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(
        numfree
        );
}
#endif
7712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007713PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007714"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007716Return True if S starts with the specified prefix, False otherwise.\n\
7717With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007718With optional end, stop comparing S at that position.\n\
7719prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720
7721static PyObject *
7722unicode_startswith(PyUnicodeObject *self,
7723 PyObject *args)
7724{
Georg Brandl24250812006-06-09 18:45:48 +00007725 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007727 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007728 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007729 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Georg Brandl24250812006-06-09 18:45:48 +00007731 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007732 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007734 if (PyTuple_Check(subobj)) {
7735 Py_ssize_t i;
7736 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7737 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7738 PyTuple_GET_ITEM(subobj, i));
7739 if (substring == NULL)
7740 return NULL;
7741 result = tailmatch(self, substring, start, end, -1);
7742 Py_DECREF(substring);
7743 if (result) {
7744 Py_RETURN_TRUE;
7745 }
7746 }
7747 /* nothing matched */
7748 Py_RETURN_FALSE;
7749 }
7750 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007752 return NULL;
7753 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007755 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756}
7757
7758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007759PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007760"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007762Return True if S ends with the specified suffix, False otherwise.\n\
7763With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007764With optional end, stop comparing S at that position.\n\
7765suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
7767static PyObject *
7768unicode_endswith(PyUnicodeObject *self,
7769 PyObject *args)
7770{
Georg Brandl24250812006-06-09 18:45:48 +00007771 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007773 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007774 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007775 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
Georg Brandl24250812006-06-09 18:45:48 +00007777 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7778 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007780 if (PyTuple_Check(subobj)) {
7781 Py_ssize_t i;
7782 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7783 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7784 PyTuple_GET_ITEM(subobj, i));
7785 if (substring == NULL)
7786 return NULL;
7787 result = tailmatch(self, substring, start, end, +1);
7788 Py_DECREF(substring);
7789 if (result) {
7790 Py_RETURN_TRUE;
7791 }
7792 }
7793 Py_RETURN_FALSE;
7794 }
7795 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798
Georg Brandl24250812006-06-09 18:45:48 +00007799 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007801 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802}
7803
7804
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007805
7806static PyObject *
7807unicode_getnewargs(PyUnicodeObject *v)
7808{
7809 return Py_BuildValue("(u#)", v->str, v->length);
7810}
7811
7812
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813static PyMethodDef unicode_methods[] = {
7814
7815 /* Order is according to common usage: often used methods should
7816 appear first, since lookup is done sequentially. */
7817
Georg Brandlecdc0a92006-03-30 12:19:07 +00007818 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007819 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7820 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007821 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007822 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7823 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7824 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7825 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7826 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7827 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7828 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007829 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007830 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7831 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7832 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007833 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007834 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007835/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7836 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7837 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7838 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007839 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007840 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007841 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007842 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007843 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7844 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7845 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7846 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7847 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7848 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7849 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7850 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7851 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7852 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7853 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7854 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7855 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7856 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007857 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007858#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007859 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860#endif
7861
7862#if 0
7863 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007864 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865#endif
7866
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007867 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 {NULL, NULL}
7869};
7870
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007871static PyObject *
7872unicode_mod(PyObject *v, PyObject *w)
7873{
7874 if (!PyUnicode_Check(v)) {
7875 Py_INCREF(Py_NotImplemented);
7876 return Py_NotImplemented;
7877 }
7878 return PyUnicode_Format(v, w);
7879}
7880
/* Number-protocol table for unicode objects.  Only nb_remainder is
   filled in (it routes the `%` operator to unicode_mod); the slots listed
   before it are explicitly empty, and the slots after it are omitted from
   the initializer and therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,                          /*nb_add*/
    0,                          /*nb_subtract*/
    0,                          /*nb_multiply*/
    0,                          /*nb_divide*/
    unicode_mod,                /*nb_remainder*/
};
7888
/* Sequence-protocol table for unicode objects.  The two assignment slots
   (sq_ass_item, sq_ass_slice) are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
7899
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007900static PyObject*
7901unicode_subscript(PyUnicodeObject* self, PyObject* item)
7902{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007903 if (PyIndex_Check(item)) {
7904 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007905 if (i == -1 && PyErr_Occurred())
7906 return NULL;
7907 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007908 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007909 return unicode_getitem(self, i);
7910 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007911 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007912 Py_UNICODE* source_buf;
7913 Py_UNICODE* result_buf;
7914 PyObject* result;
7915
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007916 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007917 &start, &stop, &step, &slicelength) < 0) {
7918 return NULL;
7919 }
7920
7921 if (slicelength <= 0) {
7922 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007923 } else if (start == 0 && step == 1 && slicelength == self->length &&
7924 PyUnicode_CheckExact(self)) {
7925 Py_INCREF(self);
7926 return (PyObject *)self;
7927 } else if (step == 1) {
7928 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007929 } else {
7930 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007931 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7932 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007933
7934 if (result_buf == NULL)
7935 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007936
7937 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7938 result_buf[i] = source_buf[cur];
7939 }
Tim Petersced69f82003-09-16 20:30:58 +00007940
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007941 result = PyUnicode_FromUnicode(result_buf, slicelength);
7942 PyMem_FREE(result_buf);
7943 return result;
7944 }
7945 } else {
7946 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7947 return NULL;
7948 }
7949}
7950
/* Mapping-protocol table for unicode objects: length and subscript are
   provided, subscript assignment is not. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
7956
Martin v. Löwis18e16552006-02-15 17:27:45 +00007957static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007959 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 const void **ptr)
7961{
7962 if (index != 0) {
7963 PyErr_SetString(PyExc_SystemError,
7964 "accessing non-existent unicode segment");
7965 return -1;
7966 }
7967 *ptr = (void *) self->str;
7968 return PyUnicode_GET_DATA_SIZE(self);
7969}
7970
Martin v. Löwis18e16552006-02-15 17:27:45 +00007971static Py_ssize_t
7972unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 const void **ptr)
7974{
7975 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007976 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 return -1;
7978}
7979
7980static int
7981unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007982 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983{
7984 if (lenp)
7985 *lenp = PyUnicode_GET_DATA_SIZE(self);
7986 return 1;
7987}
7988
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007989static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007991 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 const void **ptr)
7993{
7994 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007995
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 if (index != 0) {
7997 PyErr_SetString(PyExc_SystemError,
7998 "accessing non-existent unicode segment");
7999 return -1;
8000 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008001 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 if (str == NULL)
8003 return -1;
8004 *ptr = (void *) PyString_AS_STRING(str);
8005 return PyString_GET_SIZE(str);
8006}
8007
8008/* Helpers for PyUnicode_Format() */
8009
8010static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008011getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 if (argidx < arglen) {
8015 (*p_argidx)++;
8016 if (arglen < 0)
8017 return args;
8018 else
8019 return PyTuple_GetItem(args, argidx);
8020 }
8021 PyErr_SetString(PyExc_TypeError,
8022 "not enough arguments for format string");
8023 return NULL;
8024}
8025
/* Conversion-flag bits accumulated by PyUnicode_Format() while it parses
   the flag characters of a format specifier. */
#define F_LJUST (1<<0)  /* '-' flag (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
8031
Martin v. Löwis18e16552006-02-15 17:27:45 +00008032static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008033strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008035 register Py_ssize_t i;
8036 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 for (i = len - 1; i >= 0; i--)
8038 buffer[i] = (Py_UNICODE) charbuffer[i];
8039
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 return len;
8041}
8042
Neal Norwitzfc76d632006-01-10 06:03:13 +00008043static int
8044doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8045{
Tim Peters15231542006-02-16 01:08:01 +00008046 Py_ssize_t result;
8047
Neal Norwitzfc76d632006-01-10 06:03:13 +00008048 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008049 result = strtounicode(buffer, (char *)buffer);
8050 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008051}
8052
8053static int
8054longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8055{
Tim Peters15231542006-02-16 01:08:01 +00008056 Py_ssize_t result;
8057
Neal Norwitzfc76d632006-01-10 06:03:13 +00008058 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008059 result = strtounicode(buffer, (char *)buffer);
8060 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008061}
8062
Guido van Rossum078151d2002-08-11 04:24:12 +00008063/* XXX To save some code duplication, formatfloat/long/int could have been
8064 shared with stringobject.c, converting from 8-bit to Unicode after the
8065 formatting is done. */
8066
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067static int
8068formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008069 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 int flags,
8071 int prec,
8072 int type,
8073 PyObject *v)
8074{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008075 /* fmt = '%#.' + `prec` + `type`
8076 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 char fmt[20];
8078 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008079
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 x = PyFloat_AsDouble(v);
8081 if (x == -1.0 && PyErr_Occurred())
8082 return -1;
8083 if (prec < 0)
8084 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8086 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008087 /* Worst case length calc to ensure no buffer overrun:
8088
8089 'g' formats:
8090 fmt = %#.<prec>g
8091 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8092 for any double rep.)
8093 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8094
8095 'f' formats:
8096 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8097 len = 1 + 50 + 1 + prec = 52 + prec
8098
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008099 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008100 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008101
8102 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00008103 if (((type == 'g' || type == 'G') &&
8104 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008105 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008106 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008107 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008108 return -1;
8109 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008110 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8111 (flags&F_ALT) ? "#" : "",
8112 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008113 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114}
8115
Tim Peters38fd5b62000-09-21 05:43:11 +00008116static PyObject*
8117formatlong(PyObject *val, int flags, int prec, int type)
8118{
8119 char *buf;
8120 int i, len;
8121 PyObject *str; /* temporary string object. */
8122 PyUnicodeObject *result;
8123
8124 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8125 if (!str)
8126 return NULL;
8127 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008128 if (!result) {
8129 Py_DECREF(str);
8130 return NULL;
8131 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008132 for (i = 0; i < len; i++)
8133 result->str[i] = buf[i];
8134 result->str[len] = 0;
8135 Py_DECREF(str);
8136 return (PyObject*)result;
8137}
8138
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139static int
8140formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008141 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 int flags,
8143 int prec,
8144 int type,
8145 PyObject *v)
8146{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008147 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008148 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8149 * + 1 + 1
8150 * = 24
8151 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008152 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008153 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 long x;
8155
8156 x = PyInt_AsLong(v);
8157 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008158 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008159 if (x < 0 && type == 'u') {
8160 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008161 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008162 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8163 sign = "-";
8164 else
8165 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008167 prec = 1;
8168
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008169 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8170 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008171 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008172 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008173 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008174 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008175 return -1;
8176 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008177
8178 if ((flags & F_ALT) &&
8179 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008180 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008181 * of issues that cause pain:
8182 * - when 0 is being converted, the C standard leaves off
8183 * the '0x' or '0X', which is inconsistent with other
8184 * %#x/%#X conversions and inconsistent with Python's
8185 * hex() function
8186 * - there are platforms that violate the standard and
8187 * convert 0 with the '0x' or '0X'
8188 * (Metrowerks, Compaq Tru64)
8189 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008190 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008191 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008192 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008193 * We can achieve the desired consistency by inserting our
8194 * own '0x' or '0X' prefix, and substituting %x/%X in place
8195 * of %#x/%#X.
8196 *
8197 * Note that this is the same approach as used in
8198 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008199 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008200 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8201 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008202 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008203 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008204 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8205 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008206 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008207 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008208 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008209 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008210 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008211 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212}
8213
8214static int
8215formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008216 size_t buflen,
8217 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008219 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008220 if (PyUnicode_Check(v)) {
8221 if (PyUnicode_GET_SIZE(v) != 1)
8222 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008226 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008227 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008228 goto onError;
8229 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231
8232 else {
8233 /* Integer input truncated to a character */
8234 long x;
8235 x = PyInt_AsLong(v);
8236 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008237 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008238#ifdef Py_UNICODE_WIDE
8239 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008240 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008241 "%c arg not in range(0x110000) "
8242 "(wide Python build)");
8243 return -1;
8244 }
8245#else
8246 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008247 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008248 "%c arg not in range(0x10000) "
8249 "(narrow Python build)");
8250 return -1;
8251 }
8252#endif
8253 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 }
8255 buf[1] = '\0';
8256 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008257
8258 onError:
8259 PyErr_SetString(PyExc_TypeError,
8260 "%c requires int or char");
8261 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262}
8263
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer in which the floats, ints, & chars are formatted.  XXX This is
   a magic number.  Each formatting routine does bounds checking to
   ensure no overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
8273
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274PyObject *PyUnicode_Format(PyObject *format,
8275 PyObject *args)
8276{
8277 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008278 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 int args_owned = 0;
8280 PyUnicodeObject *result = NULL;
8281 PyObject *dict = NULL;
8282 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008283
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 if (format == NULL || args == NULL) {
8285 PyErr_BadInternalCall();
8286 return NULL;
8287 }
8288 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008289 if (uformat == NULL)
8290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 fmt = PyUnicode_AS_UNICODE(uformat);
8292 fmtcnt = PyUnicode_GET_SIZE(uformat);
8293
8294 reslen = rescnt = fmtcnt + 100;
8295 result = _PyUnicode_New(reslen);
8296 if (result == NULL)
8297 goto onError;
8298 res = PyUnicode_AS_UNICODE(result);
8299
8300 if (PyTuple_Check(args)) {
8301 arglen = PyTuple_Size(args);
8302 argidx = 0;
8303 }
8304 else {
8305 arglen = -1;
8306 argidx = -2;
8307 }
Christian Heimese93237d2007-12-19 02:37:44 +00008308 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008309 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310 dict = args;
8311
8312 while (--fmtcnt >= 0) {
8313 if (*fmt != '%') {
8314 if (--rescnt < 0) {
8315 rescnt = fmtcnt + 100;
8316 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008317 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008318 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8320 --rescnt;
8321 }
8322 *res++ = *fmt++;
8323 }
8324 else {
8325 /* Got a format specifier */
8326 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 Py_UNICODE c = '\0';
8330 Py_UNICODE fill;
8331 PyObject *v = NULL;
8332 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008333 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008335 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008336 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
8338 fmt++;
8339 if (*fmt == '(') {
8340 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008341 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 PyObject *key;
8343 int pcount = 1;
8344
8345 if (dict == NULL) {
8346 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008347 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 goto onError;
8349 }
8350 ++fmt;
8351 --fmtcnt;
8352 keystart = fmt;
8353 /* Skip over balanced parentheses */
8354 while (pcount > 0 && --fmtcnt >= 0) {
8355 if (*fmt == ')')
8356 --pcount;
8357 else if (*fmt == '(')
8358 ++pcount;
8359 fmt++;
8360 }
8361 keylen = fmt - keystart - 1;
8362 if (fmtcnt < 0 || pcount > 0) {
8363 PyErr_SetString(PyExc_ValueError,
8364 "incomplete format key");
8365 goto onError;
8366 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008367#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008368 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369 then looked up since Python uses strings to hold
8370 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008371 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 key = PyUnicode_EncodeUTF8(keystart,
8373 keylen,
8374 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008375#else
8376 key = PyUnicode_FromUnicode(keystart, keylen);
8377#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 if (key == NULL)
8379 goto onError;
8380 if (args_owned) {
8381 Py_DECREF(args);
8382 args_owned = 0;
8383 }
8384 args = PyObject_GetItem(dict, key);
8385 Py_DECREF(key);
8386 if (args == NULL) {
8387 goto onError;
8388 }
8389 args_owned = 1;
8390 arglen = -1;
8391 argidx = -2;
8392 }
8393 while (--fmtcnt >= 0) {
8394 switch (c = *fmt++) {
8395 case '-': flags |= F_LJUST; continue;
8396 case '+': flags |= F_SIGN; continue;
8397 case ' ': flags |= F_BLANK; continue;
8398 case '#': flags |= F_ALT; continue;
8399 case '0': flags |= F_ZERO; continue;
8400 }
8401 break;
8402 }
8403 if (c == '*') {
8404 v = getnextarg(args, arglen, &argidx);
8405 if (v == NULL)
8406 goto onError;
8407 if (!PyInt_Check(v)) {
8408 PyErr_SetString(PyExc_TypeError,
8409 "* wants int");
8410 goto onError;
8411 }
8412 width = PyInt_AsLong(v);
8413 if (width < 0) {
8414 flags |= F_LJUST;
8415 width = -width;
8416 }
8417 if (--fmtcnt >= 0)
8418 c = *fmt++;
8419 }
8420 else if (c >= '0' && c <= '9') {
8421 width = c - '0';
8422 while (--fmtcnt >= 0) {
8423 c = *fmt++;
8424 if (c < '0' || c > '9')
8425 break;
8426 if ((width*10) / 10 != width) {
8427 PyErr_SetString(PyExc_ValueError,
8428 "width too big");
8429 goto onError;
8430 }
8431 width = width*10 + (c - '0');
8432 }
8433 }
8434 if (c == '.') {
8435 prec = 0;
8436 if (--fmtcnt >= 0)
8437 c = *fmt++;
8438 if (c == '*') {
8439 v = getnextarg(args, arglen, &argidx);
8440 if (v == NULL)
8441 goto onError;
8442 if (!PyInt_Check(v)) {
8443 PyErr_SetString(PyExc_TypeError,
8444 "* wants int");
8445 goto onError;
8446 }
8447 prec = PyInt_AsLong(v);
8448 if (prec < 0)
8449 prec = 0;
8450 if (--fmtcnt >= 0)
8451 c = *fmt++;
8452 }
8453 else if (c >= '0' && c <= '9') {
8454 prec = c - '0';
8455 while (--fmtcnt >= 0) {
8456 c = Py_CHARMASK(*fmt++);
8457 if (c < '0' || c > '9')
8458 break;
8459 if ((prec*10) / 10 != prec) {
8460 PyErr_SetString(PyExc_ValueError,
8461 "prec too big");
8462 goto onError;
8463 }
8464 prec = prec*10 + (c - '0');
8465 }
8466 }
8467 } /* prec */
8468 if (fmtcnt >= 0) {
8469 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 if (--fmtcnt >= 0)
8471 c = *fmt++;
8472 }
8473 }
8474 if (fmtcnt < 0) {
8475 PyErr_SetString(PyExc_ValueError,
8476 "incomplete format");
8477 goto onError;
8478 }
8479 if (c != '%') {
8480 v = getnextarg(args, arglen, &argidx);
8481 if (v == NULL)
8482 goto onError;
8483 }
8484 sign = 0;
8485 fill = ' ';
8486 switch (c) {
8487
8488 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008489 pbuf = formatbuf;
8490 /* presume that buffer length is at least 1 */
8491 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 len = 1;
8493 break;
8494
8495 case 's':
8496 case 'r':
8497 if (PyUnicode_Check(v) && c == 's') {
8498 temp = v;
8499 Py_INCREF(temp);
8500 }
8501 else {
8502 PyObject *unicode;
8503 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008504 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505 else
8506 temp = PyObject_Repr(v);
8507 if (temp == NULL)
8508 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008509 if (PyUnicode_Check(temp))
8510 /* nothing to do */;
8511 else if (PyString_Check(temp)) {
8512 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008513 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008515 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008517 Py_DECREF(temp);
8518 temp = unicode;
8519 if (temp == NULL)
8520 goto onError;
8521 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008522 else {
8523 Py_DECREF(temp);
8524 PyErr_SetString(PyExc_TypeError,
8525 "%s argument has non-string str()");
8526 goto onError;
8527 }
8528 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008529 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 len = PyUnicode_GET_SIZE(temp);
8531 if (prec >= 0 && len > prec)
8532 len = prec;
8533 break;
8534
8535 case 'i':
8536 case 'd':
8537 case 'u':
8538 case 'o':
8539 case 'x':
8540 case 'X':
8541 if (c == 'i')
8542 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008543 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008544 temp = formatlong(v, flags, prec, c);
8545 if (!temp)
8546 goto onError;
8547 pbuf = PyUnicode_AS_UNICODE(temp);
8548 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008549 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008551 else {
8552 pbuf = formatbuf;
8553 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8554 flags, prec, c, v);
8555 if (len < 0)
8556 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008557 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008558 }
8559 if (flags & F_ZERO)
8560 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 break;
8562
8563 case 'e':
8564 case 'E':
8565 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008566 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 case 'g':
8568 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008569 if (c == 'F')
8570 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008571 pbuf = formatbuf;
8572 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8573 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 if (len < 0)
8575 goto onError;
8576 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008577 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 fill = '0';
8579 break;
8580
8581 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008582 pbuf = formatbuf;
8583 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 if (len < 0)
8585 goto onError;
8586 break;
8587
8588 default:
8589 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008590 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008591 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008592 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008593 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008594 (Py_ssize_t)(fmt - 1 -
8595 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 goto onError;
8597 }
8598 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008599 if (*pbuf == '-' || *pbuf == '+') {
8600 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 len--;
8602 }
8603 else if (flags & F_SIGN)
8604 sign = '+';
8605 else if (flags & F_BLANK)
8606 sign = ' ';
8607 else
8608 sign = 0;
8609 }
8610 if (width < len)
8611 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008612 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 reslen -= rescnt;
8614 rescnt = width + fmtcnt + 100;
8615 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008616 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008617 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008618 PyErr_NoMemory();
8619 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008620 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008621 if (_PyUnicode_Resize(&result, reslen) < 0) {
8622 Py_XDECREF(temp);
8623 goto onError;
8624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 res = PyUnicode_AS_UNICODE(result)
8626 + reslen - rescnt;
8627 }
8628 if (sign) {
8629 if (fill != ' ')
8630 *res++ = sign;
8631 rescnt--;
8632 if (width > len)
8633 width--;
8634 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008635 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8636 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008637 assert(pbuf[1] == c);
8638 if (fill != ' ') {
8639 *res++ = *pbuf++;
8640 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008641 }
Tim Petersfff53252001-04-12 18:38:48 +00008642 rescnt -= 2;
8643 width -= 2;
8644 if (width < 0)
8645 width = 0;
8646 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008647 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 if (width > len && !(flags & F_LJUST)) {
8649 do {
8650 --rescnt;
8651 *res++ = fill;
8652 } while (--width > len);
8653 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008654 if (fill == ' ') {
8655 if (sign)
8656 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008657 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008658 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008659 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008660 *res++ = *pbuf++;
8661 *res++ = *pbuf++;
8662 }
8663 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008664 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 res += len;
8666 rescnt -= len;
8667 while (--width >= len) {
8668 --rescnt;
8669 *res++ = ' ';
8670 }
8671 if (dict && (argidx < arglen) && c != '%') {
8672 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008673 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008674 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 goto onError;
8676 }
8677 Py_XDECREF(temp);
8678 } /* '%' */
8679 } /* until end */
8680 if (argidx < arglen && !dict) {
8681 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008682 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 goto onError;
8684 }
8685
Thomas Woutersa96affe2006-03-12 00:29:36 +00008686 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8687 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 if (args_owned) {
8689 Py_DECREF(args);
8690 }
8691 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 return (PyObject *)result;
8693
8694 onError:
8695 Py_XDECREF(result);
8696 Py_DECREF(uformat);
8697 if (args_owned) {
8698 Py_DECREF(args);
8699 }
8700 return NULL;
8701}
8702
/* Buffer-interface slot table for unicode objects.  It is referenced as
   tp_as_buffer in PyUnicode_Type.  The four entries are given
   positionally: read buffer, write buffer, segment count and character
   buffer accessors, each cast to the corresponding slot type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8709
/* Forward declaration: unicode_new() hands construction of subtype
   instances to unicode_subtype_new(), which in turn calls unicode_new()
   to build the underlying value, so one of the two must be declared
   before it is defined. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8712
Tim Peters6d6c1a32001-08-02 04:15:00 +00008713static PyObject *
8714unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8715{
8716 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008717 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008718 char *encoding = NULL;
8719 char *errors = NULL;
8720
Guido van Rossume023fe02001-08-30 03:12:59 +00008721 if (type != &PyUnicode_Type)
8722 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008723 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8724 kwlist, &x, &encoding, &errors))
8725 return NULL;
8726 if (x == NULL)
8727 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008728 if (encoding == NULL && errors == NULL)
8729 return PyObject_Unicode(x);
8730 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008731 return PyUnicode_FromEncodedObject(x, encoding, errors);
8732}
8733
Guido van Rossume023fe02001-08-30 03:12:59 +00008734static PyObject *
8735unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8736{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008737 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008738 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008739
8740 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8741 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8742 if (tmp == NULL)
8743 return NULL;
8744 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008745 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008746 if (pnew == NULL) {
8747 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008748 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008749 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008750 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8751 if (pnew->str == NULL) {
8752 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008753 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008754 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008755 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008756 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008757 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8758 pnew->length = n;
8759 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008760 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008761 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008762}
8763
/* Docstring for the unicode type; referenced as tp_doc in
   PyUnicode_Type.  The literal is continued with backslash-newline, so
   the continuation lines must start in column 0: any leading whitespace
   would become part of the string. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008770
/* Type object for "unicode".

   The initializer is positional, so the order of the entries must
   follow the PyTypeObject field order; the trailing comment on each
   line names the slot being filled.  Entries given as 0 are not set
   here.  _PyUnicode_Init() passes this object to PyType_Ready(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8814
8815/* Initialize the Unicode implementation */
8816
Thomas Wouters78890102000-07-22 19:25:51 +00008817void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008819 int i;
8820
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008821 /* XXX - move this array to unicodectype.c ? */
8822 Py_UNICODE linebreak[] = {
8823 0x000A, /* LINE FEED */
8824 0x000D, /* CARRIAGE RETURN */
8825 0x001C, /* FILE SEPARATOR */
8826 0x001D, /* GROUP SEPARATOR */
8827 0x001E, /* RECORD SEPARATOR */
8828 0x0085, /* NEXT LINE */
8829 0x2028, /* LINE SEPARATOR */
8830 0x2029, /* PARAGRAPH SEPARATOR */
8831 };
8832
Fred Drakee4315f52000-05-09 19:53:39 +00008833 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008834 free_list = NULL;
8835 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008837 if (!unicode_empty)
8838 return;
8839
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008840 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008841 for (i = 0; i < 256; i++)
8842 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008843 if (PyType_Ready(&PyUnicode_Type) < 0)
8844 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008845
8846 /* initialize the linebreak bloom filter */
8847 bloom_linebreak = make_bloom_mask(
8848 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8849 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008850
8851 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852}
8853
8854/* Finalize the Unicode implementation */
8855
8856void
Thomas Wouters78890102000-07-22 19:25:51 +00008857_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008859 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008860 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008862 Py_XDECREF(unicode_empty);
8863 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008864
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008865 for (i = 0; i < 256; i++) {
8866 if (unicode_latin1[i]) {
8867 Py_DECREF(unicode_latin1[i]);
8868 unicode_latin1[i] = NULL;
8869 }
8870 }
8871
Christian Heimes5b970ad2008-02-06 13:33:44 +00008872 for (u = free_list; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 PyUnicodeObject *v = u;
8874 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008875 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008876 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008877 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008878 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 }
Christian Heimes5b970ad2008-02-06 13:33:44 +00008880 free_list = NULL;
8881 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008883
Anthony Baxterac6bd462006-04-13 02:06:09 +00008884#ifdef __cplusplus
8885}
8886#endif
8887
8888
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008889/*
8890Local variables:
8891c-basic-offset: 4
8892indent-tabs-mode: nil
8893End:
8894*/